From f4215b1375e268155cf6f769d976617f872b938f Mon Sep 17 00:00:00 2001
From: lvyufeng
Date: Wed, 9 Jul 2025 20:30:46 +0800
Subject: [PATCH] use huggingface transformers official ut to test

---
 .github/workflows/ci_pipeline.yaml | 4 +- .gitignore | 4 +- mindnlp/__init__.py | 4 - mindnlp/core/__init__.py | 15 +- mindnlp/core/_tensor.py | 53 + mindnlp/core/_utils.py | 61 +- mindnlp/core/distributed/fsdp/__init__.py | 3 +- .../fsdp/fully_sharded_data_parallel.py | 2 + mindnlp/core/fx/__init__.py | 5 + .../core/fx/_compatibility.py | 0 mindnlp/core/nn/modules/module.py | 144 +- mindnlp/core/nn/parameter.py | 27 + mindnlp/core/serialization.py | 58 +- mindnlp/transformers/__init__.py | 1 + mindnlp/utils/safetensors_patch.py | 1 - mindnlp/utils/torch_proxy.py | 12 +- tests/peft/test_config.py | 0 tests/run_test.py | 39 + tests/transformers/__init__.py | 0 tests/transformers/generation/__init__.py | 0 .../generation/test_framework_agnostic.py | 605 --- tests/transformers/generation/test_utils.py | 3289 -------------- tests/transformers/models/__init__.py | 0 tests/transformers/models/albert/__init__.py | 0 .../models/albert/test_modeling_albert.py | 340 -- .../models/albert/test_tokenization_albert.py | 132 - tests/transformers/models/align/__init__.py | 0 .../models/align/test_modeling_align.py | 571 --- tests/transformers/models/altclip/__init__.py | 0 .../models/altclip/test_modeling_altclip.py | 535 --- .../audio_spectrogram_transformer/__init__.py | 0 ..._modeling_audio_spectrogram_transformer.py | 260 -- tests/transformers/models/auto/__init__.py | 0 .../models/auto/test_configuration_auto.py | 89 - .../models/auto/test_modeling_auto.py | 367 -- .../models/autoformer/__init__.py | 0 .../autoformer/test_modeling_autoformer.py | 489 -- .../transformers/models/baichuan/__init__.py | 0 tests/transformers/models/bark/__init__.py | 0 .../models/bark/test_modeling_bark.py | 1169 ----- tests/transformers/models/bart/__init__.py | 0 .../models/bart/test_modeling_bart.py | 1512 ------- tests/transformers/models/beit/__init__.py | 0 .../models/beit/test_modeling_beit.py | 508 --- tests/transformers/models/bert/__init__.py | 0 .../models/bert/test_modeling_bert.py | 695 --- .../models/bert/test_tokenization_bert.py | 343 -- .../models/bert_generation/__init__.py | 0 .../test_modeling_bert_generation.py | 335 -- .../transformers/models/big_bird/__init__.py | 0 .../models/big_bird/test_modeling_big_bird.py | 943 ---- .../models/bigbird_pegasus/__init__.py | 0 .../test_modeling_bigbird_pegasus.py | 816 ---- tests/transformers/models/biogpt/__init__.py | 0 .../models/biogpt/test_modeling_biogpt.py | 455 -- tests/transformers/models/bit/__init__.py | 0 .../models/bit/test_modeling_bit.py | 307 -- .../models/blenderbot/__init__.py | 0 .../blenderbot/test_modeling_blenderbot.py | 568 --- .../models/blenderbot_small/__init__.py | 0 .../test_modeling_blenderbot_small.py | 571 --- tests/transformers/models/blip/__init__.py | 0 .../models/blip/test_modeling_blip.py | 1077 ----- .../models/blip/test_modeling_blip_text.py | 178 - tests/transformers/models/blip_2/__init__.py | 0 .../models/blip_2/test_modeling_blip_2.py | 1009 ----- tests/transformers/models/bloom/__init__.py | 0 .../models/bloom/test_modeling_bloom.py | 819 ---- .../models/bridgetower/__init__.py | 0 .../bridgetower/test_modeling_bridgetower.py | 592 --- tests/transformers/models/bros/__init__.py | 0 .../models/bros/test_modeling_bros.py | 432 -- .../transformers/models/camembert/__init__.py | 0 .../camembert/test_modeling_camembert.py | 55 - 
tests/transformers/models/canine/__init__.py | 0 .../models/canine/test_modeling_canine.py | 574 --- .../models/canine/test_tokenization_canine.py | 0 tests/transformers/models/chatglm/__init__.py | 0 .../models/chatglm/test_modeling_chatglm.py | 170 - .../models/chinese_clip/__init__.py | 0 .../test_modeling_chinese_clip.py | 707 --- tests/transformers/models/clap/__init__.py | 0 .../clap/test_feature_extraction_clap.py | 560 --- .../models/clap/test_modeling_clap.py | 717 --- .../models/clap/test_processor_clap.py | 125 - tests/transformers/models/clip/__init__.py | 0 .../models/clip/test_modeling_clip.py | 668 --- .../models/clip/test_tokenization_clip.py | 183 - tests/transformers/models/clipseg/__init__.py | 0 .../models/clipseg/test_modeling_clipseg.py | 627 --- tests/transformers/models/clvp/__init__.py | 0 .../clvp/test_feature_extraction_clvp.py | 238 - .../models/clvp/test_modeling_clvp.py | 637 --- .../models/clvp/test_processor_clvp.py | 136 - .../models/clvp/test_tokenization_clvp.py | 317 -- tests/transformers/models/codegen/__init__.py | 0 .../models/codegen/test_modeling_codegen.py | 564 --- tests/transformers/models/cohere/__init__.py | 0 .../models/cohere/test_modeling_cohere.py | 326 -- .../models/conditional_detr/__init__.py | 0 .../test_image_processing_conditional_detr.py | 595 --- .../test_modeling_conditional_detr.py | 600 --- .../transformers/models/convbert/__init__.py | 0 .../models/convbert/test_modeling_convbert.py | 483 -- .../transformers/models/convnext/__init__.py | 0 .../models/convnext/test_modeling_convnext.py | 301 -- .../models/convnextv2/__init__.py | 0 .../convnextv2/test_modeling_convnextv2.py | 332 -- tests/transformers/models/cpmant/__init__.py | 0 .../models/cpmant/test_modeling_cpmant.py | 235 - tests/transformers/models/cpmbee/__init__.py | 0 .../models/cpmbee/test_modeling_cpmbee.py | 208 - tests/transformers/models/ctrl/__init__.py | 0 .../models/ctrl/test_modeling_ctrl.py | 318 -- tests/transformers/models/cvt/__init__.py | 0 .../models/cvt/test_modeling_cvt.py | 271 -- .../transformers/models/data2vec/__init__.py | 0 .../data2vec/test_modeling_data2vec_audio.py | 723 --- .../data2vec/test_modeling_data2vec_text.py | 700 --- .../data2vec/test_modeling_data2vec_vision.py | 361 -- tests/transformers/models/dbrx/__init__.py | 0 .../models/dbrx/test_modeling_dbrx.py | 399 -- tests/transformers/models/deberta/__init__.py | 0 .../models/deberta/test_modeling_deberta.py | 297 -- .../models/deberta_v2/__init__.py | 0 .../deberta_v2/test_modeling_deberta_v2.py | 314 -- .../models/decision_transformer/__init__.py | 0 .../test_modeling_decision_transformer.py | 247 - .../models/deepseek_v2/__init__.py | 0 .../models/deepseek_v2/testing_deepseek_v2.py | 9 - .../models/deformable_detr/__init__.py | 0 .../test_image_processing_deformable_detr.py | 756 ---- .../test_modeling_deformable_detr.py | 890 ---- tests/transformers/models/deit/__init__.py | 0 .../models/deit/test_image_processing_deit.py | 118 - .../models/deit/test_modeling_deit.py | 426 -- .../models/depth_anything/__init__.py | 0 .../test_modeling_depth_anything.py | 285 -- tests/transformers/models/deta/__init__.py | 0 .../models/deta/test_image_processing_deta.py | 703 --- .../models/deta/test_modeling_deta.py | 872 ---- tests/transformers/models/detr/__init__.py | 0 .../models/detr/test_image_processing_detr.py | 555 --- .../models/detr/test_modeling_detr.py | 671 --- tests/transformers/models/dinov2/__init__.py | 0 .../models/dinov2/test_modeling_dinov2.py | 327 -- 
tests/transformers/models/donut/__init__.py | 0 .../donut/test_image_processing_donut.py | 231 - .../models/donut/test_modeling_donut_swin.py | 349 -- .../models/donut/test_processing_donut.py | 51 - tests/transformers/models/dpr/__init__.py | 0 .../models/dpr/test_modeling_dpr.py | 298 -- .../models/dpr/test_tokenization_dpr.py | 88 - tests/transformers/models/dpt/__init__.py | 0 .../models/dpt/test_image_processing_dpt.py | 144 - .../models/dpt/test_modeling_dpt.py | 371 -- .../dpt/test_modeling_dpt_auto_backbone.py | 330 -- .../models/dpt/test_modeling_dpt_hybrid.py | 417 -- .../models/efficientformer/__init__.py | 0 .../test_image_processing_efficientformer.py | 100 - .../test_modeling_efficientformer.py | 416 -- tests/transformers/models/electra/__init__.py | 0 .../models/electra/test_modeling_electra.py | 480 -- tests/transformers/models/encodec/__init__.py | 0 .../models/encodec/test_modeling_encodec.py | 539 --- tests/transformers/models/ernie/__init__.py | 0 .../models/ernie/test_modeling_ernie.py | 572 --- tests/transformers/models/ernie_m/__init__.py | 0 .../models/ernie_m/test_modeling_ernie_m.py | 322 -- tests/transformers/models/esm/__init__.py | 0 .../models/esm/test_modeling_esm.py | 335 -- .../models/esm/test_modeling_esmfold.py | 285 -- tests/transformers/models/falcon/__init__.py | 0 .../models/falcon/test_modeling_falcon.py | 609 --- .../models/fastspeech2_conformer/__init__.py | 0 .../test_modeling_fastspeech2_conformer.py | 781 ---- ...test_tokenization_fastspeech2_conformer.py | 196 - .../transformers/models/flaubert/__init__.py | 0 .../models/flaubert/test_modeling_flaubert.py | 460 -- .../flaubert/test_tokenization_flaubert.py | 86 - tests/transformers/models/flava/__init__.py | 0 .../flava/test_image_processing_flava.py | 386 -- .../models/flava/test_modeling_flava.py | 1332 ------ .../models/flava/test_processor_flava.py | 244 - .../transformers/models/florence2/__init__.py | 0 .../florence2/testing_modeling_florence2.py | 30 - tests/transformers/models/fnet/__init__.py | 0 .../models/fnet/test_modeling_fnet.py | 756 ---- .../transformers/models/focalnet/__init__.py | 0 .../models/focalnet/test_modeling_focalnet.py | 428 -- tests/transformers/models/fsmt/__init__.py | 0 .../models/fsmt/test_modeling_fsmt.py | 616 --- tests/transformers/models/funnel/__init__.py | 0 .../models/funnel/test_modeling_funnel.py | 518 --- .../models/funnel/test_tokenization_funnel.py | 80 - tests/transformers/models/fuyu/__init__.py | 0 .../models/fuyu/test_modeling_fuyu.py | 402 -- tests/transformers/models/gemma/__init__.py | 0 .../models/gemma/test_modeling_gemma.py | 536 --- tests/transformers/models/gemma2/__init__.py | 0 .../models/gemma2/test_modeling_gemma2.py | 164 - tests/transformers/models/git/__init__.py | 0 .../models/git/test_modeling_git.py | 600 --- .../models/git/test_processor_git.py | 153 - tests/transformers/models/gpt2/__init__.py | 0 .../models/gpt2/test_modeling_gpt2.py | 828 ---- .../models/gpt_bigcode/__init__.py | 0 .../gpt_bigcode/test_modeling_gpt_bigcode.py | 613 --- tests/transformers/models/gpt_neo/__init__.py | 0 .../models/gpt_neo/test_modeling_gpt_neo.py | 595 --- .../transformers/models/gpt_neox/__init__.py | 0 .../models/gpt_neox/test_modeling_gpt_neox.py | 359 -- .../models/gpt_neox_japanese/__init__.py | 0 .../test_modeling_gpt_neox_japanese.py | 314 -- tests/transformers/models/gptj/__init__.py | 0 .../models/gptj/test_modeling_gptj.py | 601 --- .../models/gptsan_japanese/__init__.py | 0 .../test_modeling_gptsan_japanese.py | 509 --- 
.../models/graphormer/__init__.py | 17 - .../graphormer/test_graphormer_cells.py | 149 - .../graphormer/test_modeling_graphormer.py | 1206 ----- .../transformers/models/groupvit/__init__.py | 0 .../models/groupvit/test_modeling_groupvit.py | 589 --- tests/transformers/models/hubert/__init__.py | 0 .../models/hubert/test_modeling_hubert.py | 739 --- tests/transformers/models/ibert/__init__.py | 0 .../models/ibert/test_modeling_ibert.py | 738 --- tests/transformers/models/idefics/__init__.py | 0 .../idefics/test_image_processing_idefics.py | 206 - .../models/idefics/test_modeling_idefics.py | 668 --- .../models/idefics/test_processor_idefics.py | 210 - .../transformers/models/imagegpt/__init__.py | 0 .../test_image_processing_imagegpt.py | 299 -- .../models/imagegpt/test_modeling_imagegpt.py | 527 --- .../models/instructblip/__init__.py | 0 .../test_modeling_instructblip.py | 666 --- .../transformers/models/internlm/__init__.py | 0 .../models/internlm/test_modeling_internlm.py | 54 - tests/transformers/models/jamba/__init__.py | 0 .../models/jamba/test_modeling_jamba.py | 537 --- tests/transformers/models/jetmoe/__init__.py | 0 .../models/jetmoe/test_modeling_jetmoe.py | 430 -- tests/transformers/models/kosmos2/__init__.py | 0 .../models/kosmos2/test_modeling_kosmos2.py | 675 --- .../models/kosmos2/test_processor_kosmos2.py | 481 -- .../transformers/models/layoutlm/__init__.py | 0 .../models/layoutlm/test_modeling_layoutlm.py | 394 -- .../models/layoutlmv2/__init__.py | 0 .../layoutlmv2/test_modeling_layoutlmv2.py | 572 --- .../models/layoutlmv3/__init__.py | 0 .../layoutlmv3/test_modeling_layoutlmv3.py | 408 -- tests/transformers/models/led/__init__.py | 0 .../models/led/test_modeling_led.py | 599 --- tests/transformers/models/lilt/__init__.py | 0 .../models/lilt/test_modeling_lilt.py | 345 -- tests/transformers/models/llama/__init__.py | 0 .../models/llama/test_modeling_llama.py | 990 ---- tests/transformers/models/llava/__init__.py | 0 .../models/llava/test_modeling_llava.py | 677 --- .../models/llava_next/__init__.py | 0 .../test_image_processor_llava_next.py | 201 - .../llava_next/test_modeling_llava_next.py | 588 --- .../models/longformer/__init__.py | 0 .../longformer/test_modeling_longformer.py | 755 ---- tests/transformers/models/luke/__init__.py | 0 .../models/luke/test_modeling_luke.py | 906 ---- tests/transformers/models/lxmert/__init__.py | 0 .../models/lxmert/test_modeling_lxmert.py | 908 ---- tests/transformers/models/m2m_100/__init__.py | 0 .../models/m2m_100/test_modeling_m2m_100.py | 453 -- .../m2m_100/test_tokenization_m2m_100.py | 242 - tests/transformers/models/mamba/__init__.py | 0 .../models/mamba/test_modeling_graph_mamba.py | 124 - .../models/mamba/test_modeling_mamba.py | 502 --- tests/transformers/models/marian/__init__.py | 0 .../models/marian/test_modeling_marian.py | 847 ---- .../transformers/models/markuplm/__init__.py | 0 .../models/markuplm/test_modeling_markuplm.py | 378 -- .../models/mask2former/__init__.py | 0 .../test_image_processing_mask2former.py | 496 -- .../mask2former/test_modeling_mask2former.py | 441 -- .../models/maskformer/__init__.py | 0 .../maskformer/test_modeling_maskformer.py | 605 --- tests/transformers/models/mbart/__init__.py | 0 .../models/mbart/test_modeling_mbart.py | 726 --- tests/transformers/models/mbart50/__init__.py | 0 .../mbart50/test_tokenization_mbart50.py | 312 -- tests/transformers/models/mctct/__init__.py | 0 .../mctct/test_feature_extraction_mctct.py | 311 -- .../models/mctct/test_modeling_mctct.py | 653 --- 
.../models/mctct/test_processor_mctct.py | 157 - .../models/megatron_bert/__init__.py | 0 .../test_modeling_megatron_bert.py | 382 -- tests/transformers/models/mgp_str/__init__.py | 0 .../models/mgp_str/test_modeling_mgp_str.py | 286 -- .../models/mgp_str/test_processor_mgp_str.py | 252 -- tests/transformers/models/mistral/__init__.py | 0 .../models/mistral/test_modeling_mistral.py | 426 -- tests/transformers/models/mixtral/__init__.py | 0 .../models/mixtral/test_modeling_mixtral.py | 459 -- tests/transformers/models/mllama/__init__.py | 0 .../mllama/test_image_processing_mllama.py | 356 -- .../models/mllama/test_modeling_mllama.py | 578 --- .../models/mllama/test_processor_mllama.py | 179 - tests/transformers/models/mluke/__init__.py | 0 .../models/mluke/test_tokenization_mluke.py | 676 --- .../models/mobilebert/__init__.py | 0 .../models/mobilenet_v1/__init__.py | 0 .../test_image_processing_mobilenet_v1.py | 104 - .../test_modeling_mobilenet_v1.py | 252 -- .../models/mobilenet_v2/__init__.py | 0 .../test_image_processing_mobilenet_v2.py | 106 - .../test_modeling_mobilenet_v2.py | 325 -- .../transformers/models/mobilevit/__init__.py | 0 .../test_image_processing_mobilevit.py | 243 - .../mobilevit/test_modeling_mobilevit.py | 361 -- .../models/mobilevitv2/__init__.py | 0 .../mobilevitv2/test_modeling_mobilevitv2.py | 367 -- tests/transformers/models/mpnet/__init__.py | 0 .../models/mpnet/test_modeling_mpnet.py | 256 -- tests/transformers/models/mpt/__init__.py | 0 .../models/mpt/test_modeling_mpt.py | 503 --- tests/transformers/models/mt5/__init__.py | 0 .../models/mt5/test_modeling_mt5.py | 949 ---- .../transformers/models/musicgen/__init__.py | 0 .../models/musicgen/test_modeling_musicgen.py | 1297 ------ .../models/musicgen_melody/__init__.py | 0 ...test_feature_extraction_musicgen_melody.py | 224 - .../test_modeling_musicgen_melody.py | 1289 ------ .../test_processor_musicgen_melody.py | 0 tests/transformers/models/mvp/__init__.py | 0 .../models/mvp/test_modeling_mvp.py | 812 ---- tests/transformers/models/nllb/__init__.py | 0 .../models/nllb/test_tokenization_nllb.py | 470 -- .../transformers/models/nllb_moe/__init__.py | 0 .../models/nllb_moe/test_modeling_nllb_moe.py | 674 --- tests/transformers/models/nougat/__init__.py | 0 .../nougat/test_image_processing_nougat.py | 221 - .../models/nougat/test_modeling_nougat.py | 0 .../models/nystromformer/__init__.py | 0 .../test_modeling_nystromformer.py | 296 -- tests/transformers/models/olmo/__init__.py | 0 .../models/olmo/test_modeling_olmo.py | 439 -- .../transformers/models/oneformer/__init__.py | 0 .../test_image_processing_oneformer.py | 351 -- .../oneformer/test_modeling_oneformer.py | 518 --- .../oneformer/test_processor_oneformer.py | 804 ---- tests/transformers/models/openai/__init__.py | 0 .../models/openai/test_modeling_gpt.py | 302 -- tests/transformers/models/opt/__init__.py | 0 .../models/opt/test_modeling_opt.py | 561 --- tests/transformers/models/owlv2/__init__.py | 0 .../owlv2/test_image_processor_owlv2.py | 201 - .../models/owlv2/test_modeling_owlv2.py | 770 ---- tests/transformers/models/owlvit/__init__.py | 0 .../owlvit/test_image_processing_owlvit.py | 126 - .../models/owlvit/test_modeling_owlvit.py | 758 ---- .../models/owlvit/test_processor_owlvit.py | 293 -- .../transformers/models/patchtst/__init__.py | 0 .../models/patchtst/test_modeling_patchtst.py | 384 -- tests/transformers/models/pegasus/__init__.py | 0 .../models/pegasus/test_modeling_pegasus.py | 602 --- .../transformers/models/pegasus_x/__init__.py | 0 
.../pegasus_x/test_modeling_pegasus_x.py | 853 ---- .../transformers/models/perceiver/__init__.py | 0 .../perceiver/test_modeling_perceiver.py | 1004 ----- .../perceiver/test_tokenization_perceiver.py | 299 -- .../transformers/models/persimmon/__init__.py | 0 .../persimmon/test_modeling_persimmon.py | 520 --- tests/transformers/models/phi/__init__.py | 0 .../models/phi/test_modeling_phi.py | 451 -- tests/transformers/models/phi3/__init__.py | 0 .../models/phi3/test_modeling_phi3.py | 562 --- .../models/pix2struct/__init__.py | 0 .../test_image_processing_pix2struct.py | 355 -- .../pix2struct/test_modeling_pix2struct.py | 846 ---- .../pix2struct/test_processor_pix2struct.py | 193 - tests/transformers/models/plbart/__init__.py | 0 .../models/plbart/test_modeling_plbart.py | 666 --- .../models/poolformer/__init__.py | 0 .../test_image_processing_poolformer.py | 120 - .../poolformer/test_modeling_poolformer.py | 244 - .../transformers/models/pop2piano/__init__.py | 0 .../pop2piano/test_modeling_pop2piano.py | 754 ---- .../models/prophetnet/__init__.py | 0 .../prophetnet/test_modeling_prophetnet.py | 1287 ------ .../test_tokenization_prophetnet.py | 0 tests/transformers/models/qdqbert/__init__.py | 0 .../models/qdqbert/test_modeling_qdqbert.py | 720 --- tests/transformers/models/qwen2/__init__.py | 0 .../models/qwen2/test_modeling_qwen2.py | 485 -- .../transformers/models/qwen2_moe/__init__.py | 0 .../qwen2_moe/test_modeling_qwen2_moe.py | 566 --- .../transformers/models/qwen2_vl/__init__.py | 0 .../models/qwen2_vl/test_modeling_qwen2_vl.py | 432 -- tests/transformers/models/rag/__init__.py | 0 .../models/rag/test_modeling_rag.py | 1191 ----- .../models/rag/test_retrieval_rag.py | 320 -- .../models/rag/test_tokenization_rag.py | 168 - tests/transformers/models/realm/__init__.py | 0 .../models/realm/test_modeling_realm.py | 547 --- .../transformers/models/reformer/__init__.py | 0 .../models/reformer/test_modeling_reformer.py | 1299 ------ tests/transformers/models/rembert/__init__.py | 0 .../models/rembert/test_modeling_rembert.py | 501 --- tests/transformers/models/resnet/__init__.py | 0 .../models/resnet/test_modeling_resnet.py | 314 -- tests/transformers/models/roberta/__init__.py | 0 .../models/roberta/test_modeling_roberta.py | 570 --- .../models/roberta_prelayernorm/__init__.py | 0 .../test_modeling_roberta_prelayernorm.py | 560 --- .../transformers/models/roc_bert/__init__.py | 0 .../models/roc_bert/test_modeling_roc_bert.py | 828 ---- tests/transformers/models/rwkv/__init__.py | 0 .../models/rwkv/test_modeling_rwkv.py | 485 -- tests/transformers/models/sam/__init__.py | 0 .../models/sam/test_modeling_sam.py | 736 --- .../models/sam/test_processor_sam.py | 171 - .../models/seamless_m4t/__init__.py | 0 .../test_modeling_seamless_m4t.py | 1132 ----- .../models/seamless_m4t_v2/__init__.py | 0 .../test_modeling_seamless_m4t_v2.py | 1187 ----- .../transformers/models/segformer/__init__.py | 0 .../segformer/test_modeling_segformer.py | 416 -- tests/transformers/models/seggpt/__init__.py | 0 .../seggpt/test_image_processing_seggpt.py | 356 -- .../models/seggpt/test_modeling_seggpt.py | 463 -- tests/transformers/models/sew/__init__.py | 0 .../models/sew/test_modeling_sew.py | 597 --- tests/transformers/models/sew_d/__init__.py | 0 .../models/sew_d/test_modeling_sew_d.py | 546 --- .../models/speech_encoder_decoder/__init__.py | 0 .../test_modeling_speech_encoder_decoder.py | 789 ---- .../models/speech_to_text/__init__.py | 0 .../test_modeling_speech_to_text.py | 763 ---- 
.../test_tokenization_speech_to_text.py | 195 - .../transformers/models/speecht5/__init__.py | 0 .../test_feature_extraction_speecht5.py | 422 -- .../models/speecht5/test_modeling_speecht5.py | 1871 -------- .../speecht5/test_processor_speecht5.py | 185 - .../speecht5/test_tokenization_speecht5.py | 222 - .../transformers/models/splinter/__init__.py | 0 .../models/splinter/test_modeling_splinter.py | 653 --- .../models/squeezebert/__init__.py | 0 .../squeezebert/test_modeling_squeezebert.py | 383 -- .../transformers/models/stablelm/__init__.py | 0 .../models/stablelm/test_modeling_stablelm.py | 548 --- .../models/starcoder2/__init__.py | 0 .../starcoder2/test_modeling_starcoder2.py | 474 -- .../models/superpoint/__init__.py | 0 .../test_image_processing_superpoint.py | 112 - .../superpoint/test_modeling_superpoint.py | 308 -- .../models/swiftformer/__init__.py | 0 .../swiftformer/test_modeling_swiftformer.py | 284 -- tests/transformers/models/swin/__init__.py | 0 .../models/swin/test_modeling_swin.py | 513 --- tests/transformers/models/swin2sr/__init__.py | 0 .../swin2sr/test_image_processing_swin2sr.py | 180 - .../models/swin2sr/test_modeling_swin2sr.py | 350 -- .../models/switch_transformers/__init__.py | 0 .../test_modeling_switch_transformers.py | 1089 ----- tests/transformers/models/t5/__init__.py | 0 .../models/t5/test_modeling_t5.py | 1517 ------- tests/transformers/models/tapas/__init__.py | 0 .../models/tapas/test_modeling_tapas.py | 1084 ----- .../models/tapas/test_tokenization_tapas.py | 1189 ----- tests/transformers/models/tapex/__init__.py | 0 .../models/tapex/test_tokenization_tapex.py | 912 ---- .../time_series_transformer/__init__.py | 0 .../test_modeling_time_series_transformer.py | 551 --- .../models/timesformer/__init__.py | 0 .../timesformer/test_modeling_timesformer.py | 346 -- .../transformers/models/tinybert/__init__.py | 0 tests/transformers/models/trocr/__init__.py | 0 .../models/trocr/test_modeling_trocr.py | 197 - tests/transformers/models/tvlt/__init__.py | 0 .../models/tvlt/test_modeling_tvlt.py | 604 --- tests/transformers/models/udop/__init__.py | 0 .../models/udop/test_modeling_udop.py | 575 --- .../models/udop/test_processor_udop.py | 510 --- .../models/udop/test_tokenization_udop.py | 1956 -------- tests/transformers/models/umt5/__init__.py | 0 .../models/umt5/test_modeling_umt5.py | 626 --- .../transformers/models/unispeech/__init__.py | 0 .../unispeech/test_modeling_unispeech.py | 585 --- .../models/unispeech_sat/__init__.py | 0 .../test_modeling_unispeech_sat.py | 929 ---- tests/transformers/models/univnet/__init__.py | 0 .../test_feature_extraction_univnet.py | 368 -- .../models/univnet/test_modeling_univnet.py | 438 -- tests/transformers/models/upernet/__init__.py | 0 .../models/upernet/test_modeling_upernet.py | 300 -- .../transformers/models/videomae/__init__.py | 0 .../test_image_processing_videomae.py | 214 - .../models/videomae/test_modeling_videomae.py | 398 -- tests/transformers/models/vilt/__init__.py | 0 .../models/vilt/test_image_processing_vilt.py | 155 - .../models/vilt/test_modeling_vilt.py | 654 --- .../transformers/models/vipllava/__init__.py | 0 .../models/vipllava/test_modeling_vipllava.py | 456 -- .../models/vision_encoder_decoder/__init__.py | 0 .../test_modeling_vision_encoder_decoder.py | 1178 ----- .../vision_text_dual_encoder/__init__.py | 0 .../test_modeling_vision_text_dual_encoder.py | 430 -- ...test_processor_vision_text_dual_encoder.py | 182 - .../models/visual_bert/__init__.py | 0 
.../visual_bert/test_modeling_visual_bert.py | 694 --- tests/transformers/models/vit/__init__.py | 0 .../models/vit/test_image_processing_vit.py | 92 - .../models/vit/test_modeling_vit.py | 302 -- .../models/vit_hybrid/__init__.py | 0 .../vit_hybrid/test_modeling_vit_hybrid.py | 279 -- tests/transformers/models/vit_mae/__init__.py | 0 .../models/vit_mae/test_modeling_vit_mae.py | 317 -- tests/transformers/models/vit_msn/__init__.py | 0 .../models/vit_msn/test_modeling_vit_msn.py | 226 - tests/transformers/models/vitdet/__init__.py | 0 .../models/vitdet/test_modeling_vitdet.py | 302 -- .../transformers/models/vitmatte/__init__.py | 0 .../test_image_processing_vitmatte.py | 200 - .../models/vitmatte/test_modeling_vitmatte.py | 289 -- tests/transformers/models/vits/__init__.py | 0 .../models/vits/test_modeling_vits.py | 428 -- tests/transformers/models/vivit/__init__.py | 0 .../models/vivit/test_image_processing_vit.py | 230 - .../models/vivit/test_modeling_vivit.py | 361 -- .../transformers/models/wav2vec2/__init__.py | 0 .../test_feature_extraction_wav2vec2.py | 233 - .../models/wav2vec2/test_modeling_wav2vec2.py | 1739 -------- .../wav2vec2/test_processor_wav2vec2.py | 153 - .../wav2vec2/test_tokenization_wav2vec2.py | 827 ---- .../models/wav2vec2_bert/__init__.py | 0 .../test_modeling_wav2vec2_bert.py | 899 ---- .../test_processor_wav2vec2_bert.py | 157 - .../models/wav2vec2_conformer/__init__.py | 0 .../test_modeling_wav2vec2_conformer.py | 936 ---- tests/transformers/models/wavlm/__init__.py | 0 .../models/wavlm/test_modeling_wavlm.py | 614 --- tests/transformers/models/whisper/__init__.py | 0 .../models/whisper/test_modeling_whisper.py | 3226 ------------- tests/transformers/models/x_clip/__init__.py | 0 .../models/x_clip/test_modeling_x_clip.py | 712 --- tests/transformers/models/xlm/__init__.py | 0 .../models/xlm/test_modeling_xlm.py | 539 --- .../models/xlm/test_tokenization_xlm.py | 97 - .../models/xlm_prophetnet/__init__.py | 0 .../test_modeling_xlm_prophetnet.py | 141 - .../models/xlm_roberta/__init__.py | 0 .../xlm_roberta/test_modeling_xlm_roberta.py | 75 - .../models/xlm_roberta_xl/__init__.py | 0 .../test_modeling_xlm_roberta_xl.py | 546 --- tests/transformers/models/xlnet/__init__.py | 0 .../models/xlnet/test_modeling_xlnet.py | 728 --- .../models/xlnet/test_tokenization_xlnet.py | 261 -- tests/transformers/models/xmod/__init__.py | 0 .../models/xmod/test_modeling_xmod.py | 674 --- tests/transformers/models/yolos/__init__.py | 0 .../yolos/test_image_processing_yolos.py | 532 --- .../models/yolos/test_modeling_yolos.py | 363 -- tests/transformers/pipelines/__init__.py | 0 .../test_pipelines_audio_classification.py | 140 - ..._pipelines_automatic_speech_recognition.py | 1954 -------- .../pipelines/test_pipelines_common.py | 472 -- .../test_pipelines_depth_estimation.py | 149 - ...t_pipelines_document_question_answering.py | 344 -- .../test_pipelines_feature_extraction.py | 165 - .../pipelines/test_pipelines_fill_mask.py | 411 -- .../test_pipelines_image_classification.py | 289 -- ...test_pipelines_image_feature_extraction.py | 148 - .../test_pipelines_image_segmentation.py | 740 --- .../test_pipelines_question_answering.py | 491 -- ...test_pipelines_table_question_answering.py | 353 -- .../test_pipelines_text2text_generation.py | 97 - .../test_pipelines_text_classification.py | 157 - .../test_pipelines_text_generation.py | 354 -- ...test_pipelines_zero_shot_classification.py | 233 - tests/transformers/test_backbone_common.py | 227 - 
.../transformers/test_configuration_common.py | 205 - .../test_feature_extraction_common.py | 55 - .../test_image_processing_common.py | 369 -- tests/transformers/test_modeling_common.py | 3268 -------------- tests/transformers/test_pipeline_mixin.py | 0 ...test_sequence_feature_extraction_common.py | 394 -- .../transformers/test_tokenization_common.py | 3973 ----------------- 584 files changed, 315 insertions(+), 176080 deletions(-) create mode 100644 mindnlp/core/distributed/fsdp/fully_sharded_data_parallel.py rename tests/peft/__init__.py => mindnlp/core/fx/_compatibility.py (100%) delete mode 100644 tests/peft/test_config.py create mode 100644 tests/run_test.py delete mode 100644 tests/transformers/__init__.py delete mode 100644 tests/transformers/generation/__init__.py delete mode 100644 tests/transformers/generation/test_framework_agnostic.py delete mode 100644 tests/transformers/generation/test_utils.py delete mode 100644 tests/transformers/models/__init__.py delete mode 100644 tests/transformers/models/albert/__init__.py delete mode 100644 tests/transformers/models/albert/test_modeling_albert.py delete mode 100644 tests/transformers/models/albert/test_tokenization_albert.py delete mode 100644 tests/transformers/models/align/__init__.py delete mode 100644 tests/transformers/models/align/test_modeling_align.py delete mode 100644 tests/transformers/models/altclip/__init__.py delete mode 100644 tests/transformers/models/altclip/test_modeling_altclip.py delete mode 100644 tests/transformers/models/audio_spectrogram_transformer/__init__.py delete mode 100644 tests/transformers/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py delete mode 100644 tests/transformers/models/auto/__init__.py delete mode 100644 tests/transformers/models/auto/test_configuration_auto.py delete mode 100644 tests/transformers/models/auto/test_modeling_auto.py delete mode 100644 tests/transformers/models/autoformer/__init__.py delete mode 100644 tests/transformers/models/autoformer/test_modeling_autoformer.py delete mode 100644 tests/transformers/models/baichuan/__init__.py delete mode 100644 tests/transformers/models/bark/__init__.py delete mode 100644 tests/transformers/models/bark/test_modeling_bark.py delete mode 100644 tests/transformers/models/bart/__init__.py delete mode 100644 tests/transformers/models/bart/test_modeling_bart.py delete mode 100644 tests/transformers/models/beit/__init__.py delete mode 100644 tests/transformers/models/beit/test_modeling_beit.py delete mode 100644 tests/transformers/models/bert/__init__.py delete mode 100644 tests/transformers/models/bert/test_modeling_bert.py delete mode 100644 tests/transformers/models/bert/test_tokenization_bert.py delete mode 100644 tests/transformers/models/bert_generation/__init__.py delete mode 100644 tests/transformers/models/bert_generation/test_modeling_bert_generation.py delete mode 100644 tests/transformers/models/big_bird/__init__.py delete mode 100644 tests/transformers/models/big_bird/test_modeling_big_bird.py delete mode 100644 tests/transformers/models/bigbird_pegasus/__init__.py delete mode 100644 tests/transformers/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py delete mode 100644 tests/transformers/models/biogpt/__init__.py delete mode 100644 tests/transformers/models/biogpt/test_modeling_biogpt.py delete mode 100644 tests/transformers/models/bit/__init__.py delete mode 100644 tests/transformers/models/bit/test_modeling_bit.py delete mode 100644 tests/transformers/models/blenderbot/__init__.py 
delete mode 100644 tests/transformers/models/blenderbot/test_modeling_blenderbot.py delete mode 100644 tests/transformers/models/blenderbot_small/__init__.py delete mode 100644 tests/transformers/models/blenderbot_small/test_modeling_blenderbot_small.py delete mode 100644 tests/transformers/models/blip/__init__.py delete mode 100644 tests/transformers/models/blip/test_modeling_blip.py delete mode 100644 tests/transformers/models/blip/test_modeling_blip_text.py delete mode 100644 tests/transformers/models/blip_2/__init__.py delete mode 100644 tests/transformers/models/blip_2/test_modeling_blip_2.py delete mode 100644 tests/transformers/models/bloom/__init__.py delete mode 100644 tests/transformers/models/bloom/test_modeling_bloom.py delete mode 100644 tests/transformers/models/bridgetower/__init__.py delete mode 100644 tests/transformers/models/bridgetower/test_modeling_bridgetower.py delete mode 100644 tests/transformers/models/bros/__init__.py delete mode 100644 tests/transformers/models/bros/test_modeling_bros.py delete mode 100644 tests/transformers/models/camembert/__init__.py delete mode 100644 tests/transformers/models/camembert/test_modeling_camembert.py delete mode 100644 tests/transformers/models/canine/__init__.py delete mode 100644 tests/transformers/models/canine/test_modeling_canine.py delete mode 100644 tests/transformers/models/canine/test_tokenization_canine.py delete mode 100644 tests/transformers/models/chatglm/__init__.py delete mode 100644 tests/transformers/models/chatglm/test_modeling_chatglm.py delete mode 100644 tests/transformers/models/chinese_clip/__init__.py delete mode 100644 tests/transformers/models/chinese_clip/test_modeling_chinese_clip.py delete mode 100644 tests/transformers/models/clap/__init__.py delete mode 100644 tests/transformers/models/clap/test_feature_extraction_clap.py delete mode 100644 tests/transformers/models/clap/test_modeling_clap.py delete mode 100644 tests/transformers/models/clap/test_processor_clap.py delete mode 100644 tests/transformers/models/clip/__init__.py delete mode 100644 tests/transformers/models/clip/test_modeling_clip.py delete mode 100644 tests/transformers/models/clip/test_tokenization_clip.py delete mode 100644 tests/transformers/models/clipseg/__init__.py delete mode 100644 tests/transformers/models/clipseg/test_modeling_clipseg.py delete mode 100644 tests/transformers/models/clvp/__init__.py delete mode 100644 tests/transformers/models/clvp/test_feature_extraction_clvp.py delete mode 100644 tests/transformers/models/clvp/test_modeling_clvp.py delete mode 100644 tests/transformers/models/clvp/test_processor_clvp.py delete mode 100644 tests/transformers/models/clvp/test_tokenization_clvp.py delete mode 100644 tests/transformers/models/codegen/__init__.py delete mode 100644 tests/transformers/models/codegen/test_modeling_codegen.py delete mode 100644 tests/transformers/models/cohere/__init__.py delete mode 100644 tests/transformers/models/cohere/test_modeling_cohere.py delete mode 100644 tests/transformers/models/conditional_detr/__init__.py delete mode 100644 tests/transformers/models/conditional_detr/test_image_processing_conditional_detr.py delete mode 100644 tests/transformers/models/conditional_detr/test_modeling_conditional_detr.py delete mode 100644 tests/transformers/models/convbert/__init__.py delete mode 100644 tests/transformers/models/convbert/test_modeling_convbert.py delete mode 100644 tests/transformers/models/convnext/__init__.py delete mode 100644 
tests/transformers/models/convnext/test_modeling_convnext.py delete mode 100644 tests/transformers/models/convnextv2/__init__.py delete mode 100644 tests/transformers/models/convnextv2/test_modeling_convnextv2.py delete mode 100644 tests/transformers/models/cpmant/__init__.py delete mode 100644 tests/transformers/models/cpmant/test_modeling_cpmant.py delete mode 100644 tests/transformers/models/cpmbee/__init__.py delete mode 100644 tests/transformers/models/cpmbee/test_modeling_cpmbee.py delete mode 100644 tests/transformers/models/ctrl/__init__.py delete mode 100644 tests/transformers/models/ctrl/test_modeling_ctrl.py delete mode 100644 tests/transformers/models/cvt/__init__.py delete mode 100644 tests/transformers/models/cvt/test_modeling_cvt.py delete mode 100644 tests/transformers/models/data2vec/__init__.py delete mode 100644 tests/transformers/models/data2vec/test_modeling_data2vec_audio.py delete mode 100644 tests/transformers/models/data2vec/test_modeling_data2vec_text.py delete mode 100644 tests/transformers/models/data2vec/test_modeling_data2vec_vision.py delete mode 100644 tests/transformers/models/dbrx/__init__.py delete mode 100644 tests/transformers/models/dbrx/test_modeling_dbrx.py delete mode 100644 tests/transformers/models/deberta/__init__.py delete mode 100644 tests/transformers/models/deberta/test_modeling_deberta.py delete mode 100644 tests/transformers/models/deberta_v2/__init__.py delete mode 100644 tests/transformers/models/deberta_v2/test_modeling_deberta_v2.py delete mode 100644 tests/transformers/models/decision_transformer/__init__.py delete mode 100644 tests/transformers/models/decision_transformer/test_modeling_decision_transformer.py delete mode 100644 tests/transformers/models/deepseek_v2/__init__.py delete mode 100644 tests/transformers/models/deepseek_v2/testing_deepseek_v2.py delete mode 100644 tests/transformers/models/deformable_detr/__init__.py delete mode 100644 tests/transformers/models/deformable_detr/test_image_processing_deformable_detr.py delete mode 100644 tests/transformers/models/deformable_detr/test_modeling_deformable_detr.py delete mode 100644 tests/transformers/models/deit/__init__.py delete mode 100644 tests/transformers/models/deit/test_image_processing_deit.py delete mode 100644 tests/transformers/models/deit/test_modeling_deit.py delete mode 100644 tests/transformers/models/depth_anything/__init__.py delete mode 100644 tests/transformers/models/depth_anything/test_modeling_depth_anything.py delete mode 100644 tests/transformers/models/deta/__init__.py delete mode 100644 tests/transformers/models/deta/test_image_processing_deta.py delete mode 100644 tests/transformers/models/deta/test_modeling_deta.py delete mode 100644 tests/transformers/models/detr/__init__.py delete mode 100644 tests/transformers/models/detr/test_image_processing_detr.py delete mode 100644 tests/transformers/models/detr/test_modeling_detr.py delete mode 100644 tests/transformers/models/dinov2/__init__.py delete mode 100644 tests/transformers/models/dinov2/test_modeling_dinov2.py delete mode 100644 tests/transformers/models/donut/__init__.py delete mode 100644 tests/transformers/models/donut/test_image_processing_donut.py delete mode 100644 tests/transformers/models/donut/test_modeling_donut_swin.py delete mode 100644 tests/transformers/models/donut/test_processing_donut.py delete mode 100644 tests/transformers/models/dpr/__init__.py delete mode 100644 tests/transformers/models/dpr/test_modeling_dpr.py delete mode 100644 
tests/transformers/models/dpr/test_tokenization_dpr.py delete mode 100644 tests/transformers/models/dpt/__init__.py delete mode 100644 tests/transformers/models/dpt/test_image_processing_dpt.py delete mode 100644 tests/transformers/models/dpt/test_modeling_dpt.py delete mode 100644 tests/transformers/models/dpt/test_modeling_dpt_auto_backbone.py delete mode 100644 tests/transformers/models/dpt/test_modeling_dpt_hybrid.py delete mode 100644 tests/transformers/models/efficientformer/__init__.py delete mode 100644 tests/transformers/models/efficientformer/test_image_processing_efficientformer.py delete mode 100644 tests/transformers/models/efficientformer/test_modeling_efficientformer.py delete mode 100644 tests/transformers/models/electra/__init__.py delete mode 100644 tests/transformers/models/electra/test_modeling_electra.py delete mode 100644 tests/transformers/models/encodec/__init__.py delete mode 100644 tests/transformers/models/encodec/test_modeling_encodec.py delete mode 100644 tests/transformers/models/ernie/__init__.py delete mode 100644 tests/transformers/models/ernie/test_modeling_ernie.py delete mode 100644 tests/transformers/models/ernie_m/__init__.py delete mode 100644 tests/transformers/models/ernie_m/test_modeling_ernie_m.py delete mode 100644 tests/transformers/models/esm/__init__.py delete mode 100644 tests/transformers/models/esm/test_modeling_esm.py delete mode 100644 tests/transformers/models/esm/test_modeling_esmfold.py delete mode 100644 tests/transformers/models/falcon/__init__.py delete mode 100644 tests/transformers/models/falcon/test_modeling_falcon.py delete mode 100644 tests/transformers/models/fastspeech2_conformer/__init__.py delete mode 100644 tests/transformers/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py delete mode 100644 tests/transformers/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py delete mode 100644 tests/transformers/models/flaubert/__init__.py delete mode 100644 tests/transformers/models/flaubert/test_modeling_flaubert.py delete mode 100644 tests/transformers/models/flaubert/test_tokenization_flaubert.py delete mode 100644 tests/transformers/models/flava/__init__.py delete mode 100644 tests/transformers/models/flava/test_image_processing_flava.py delete mode 100644 tests/transformers/models/flava/test_modeling_flava.py delete mode 100644 tests/transformers/models/flava/test_processor_flava.py delete mode 100644 tests/transformers/models/florence2/__init__.py delete mode 100644 tests/transformers/models/florence2/testing_modeling_florence2.py delete mode 100644 tests/transformers/models/fnet/__init__.py delete mode 100644 tests/transformers/models/fnet/test_modeling_fnet.py delete mode 100644 tests/transformers/models/focalnet/__init__.py delete mode 100644 tests/transformers/models/focalnet/test_modeling_focalnet.py delete mode 100644 tests/transformers/models/fsmt/__init__.py delete mode 100644 tests/transformers/models/fsmt/test_modeling_fsmt.py delete mode 100644 tests/transformers/models/funnel/__init__.py delete mode 100644 tests/transformers/models/funnel/test_modeling_funnel.py delete mode 100644 tests/transformers/models/funnel/test_tokenization_funnel.py delete mode 100644 tests/transformers/models/fuyu/__init__.py delete mode 100644 tests/transformers/models/fuyu/test_modeling_fuyu.py delete mode 100644 tests/transformers/models/gemma/__init__.py delete mode 100644 tests/transformers/models/gemma/test_modeling_gemma.py delete mode 100644 tests/transformers/models/gemma2/__init__.py delete 
mode 100644 tests/transformers/models/gemma2/test_modeling_gemma2.py delete mode 100644 tests/transformers/models/git/__init__.py delete mode 100644 tests/transformers/models/git/test_modeling_git.py delete mode 100644 tests/transformers/models/git/test_processor_git.py delete mode 100644 tests/transformers/models/gpt2/__init__.py delete mode 100644 tests/transformers/models/gpt2/test_modeling_gpt2.py delete mode 100644 tests/transformers/models/gpt_bigcode/__init__.py delete mode 100644 tests/transformers/models/gpt_bigcode/test_modeling_gpt_bigcode.py delete mode 100644 tests/transformers/models/gpt_neo/__init__.py delete mode 100644 tests/transformers/models/gpt_neo/test_modeling_gpt_neo.py delete mode 100644 tests/transformers/models/gpt_neox/__init__.py delete mode 100644 tests/transformers/models/gpt_neox/test_modeling_gpt_neox.py delete mode 100644 tests/transformers/models/gpt_neox_japanese/__init__.py delete mode 100644 tests/transformers/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py delete mode 100644 tests/transformers/models/gptj/__init__.py delete mode 100644 tests/transformers/models/gptj/test_modeling_gptj.py delete mode 100644 tests/transformers/models/gptsan_japanese/__init__.py delete mode 100644 tests/transformers/models/gptsan_japanese/test_modeling_gptsan_japanese.py delete mode 100644 tests/transformers/models/graphormer/__init__.py delete mode 100644 tests/transformers/models/graphormer/test_graphormer_cells.py delete mode 100644 tests/transformers/models/graphormer/test_modeling_graphormer.py delete mode 100644 tests/transformers/models/groupvit/__init__.py delete mode 100644 tests/transformers/models/groupvit/test_modeling_groupvit.py delete mode 100644 tests/transformers/models/hubert/__init__.py delete mode 100644 tests/transformers/models/hubert/test_modeling_hubert.py delete mode 100644 tests/transformers/models/ibert/__init__.py delete mode 100644 tests/transformers/models/ibert/test_modeling_ibert.py delete mode 100644 tests/transformers/models/idefics/__init__.py delete mode 100644 tests/transformers/models/idefics/test_image_processing_idefics.py delete mode 100644 tests/transformers/models/idefics/test_modeling_idefics.py delete mode 100644 tests/transformers/models/idefics/test_processor_idefics.py delete mode 100644 tests/transformers/models/imagegpt/__init__.py delete mode 100644 tests/transformers/models/imagegpt/test_image_processing_imagegpt.py delete mode 100644 tests/transformers/models/imagegpt/test_modeling_imagegpt.py delete mode 100644 tests/transformers/models/instructblip/__init__.py delete mode 100644 tests/transformers/models/instructblip/test_modeling_instructblip.py delete mode 100644 tests/transformers/models/internlm/__init__.py delete mode 100644 tests/transformers/models/internlm/test_modeling_internlm.py delete mode 100644 tests/transformers/models/jamba/__init__.py delete mode 100644 tests/transformers/models/jamba/test_modeling_jamba.py delete mode 100644 tests/transformers/models/jetmoe/__init__.py delete mode 100644 tests/transformers/models/jetmoe/test_modeling_jetmoe.py delete mode 100644 tests/transformers/models/kosmos2/__init__.py delete mode 100644 tests/transformers/models/kosmos2/test_modeling_kosmos2.py delete mode 100644 tests/transformers/models/kosmos2/test_processor_kosmos2.py delete mode 100644 tests/transformers/models/layoutlm/__init__.py delete mode 100644 tests/transformers/models/layoutlm/test_modeling_layoutlm.py delete mode 100644 tests/transformers/models/layoutlmv2/__init__.py delete mode 
100644 tests/transformers/models/layoutlmv2/test_modeling_layoutlmv2.py delete mode 100644 tests/transformers/models/layoutlmv3/__init__.py delete mode 100644 tests/transformers/models/layoutlmv3/test_modeling_layoutlmv3.py delete mode 100644 tests/transformers/models/led/__init__.py delete mode 100644 tests/transformers/models/led/test_modeling_led.py delete mode 100644 tests/transformers/models/lilt/__init__.py delete mode 100644 tests/transformers/models/lilt/test_modeling_lilt.py delete mode 100644 tests/transformers/models/llama/__init__.py delete mode 100644 tests/transformers/models/llama/test_modeling_llama.py delete mode 100644 tests/transformers/models/llava/__init__.py delete mode 100644 tests/transformers/models/llava/test_modeling_llava.py delete mode 100644 tests/transformers/models/llava_next/__init__.py delete mode 100644 tests/transformers/models/llava_next/test_image_processor_llava_next.py delete mode 100644 tests/transformers/models/llava_next/test_modeling_llava_next.py delete mode 100644 tests/transformers/models/longformer/__init__.py delete mode 100644 tests/transformers/models/longformer/test_modeling_longformer.py delete mode 100644 tests/transformers/models/luke/__init__.py delete mode 100644 tests/transformers/models/luke/test_modeling_luke.py delete mode 100644 tests/transformers/models/lxmert/__init__.py delete mode 100644 tests/transformers/models/lxmert/test_modeling_lxmert.py delete mode 100644 tests/transformers/models/m2m_100/__init__.py delete mode 100644 tests/transformers/models/m2m_100/test_modeling_m2m_100.py delete mode 100644 tests/transformers/models/m2m_100/test_tokenization_m2m_100.py delete mode 100644 tests/transformers/models/mamba/__init__.py delete mode 100644 tests/transformers/models/mamba/test_modeling_graph_mamba.py delete mode 100644 tests/transformers/models/mamba/test_modeling_mamba.py delete mode 100644 tests/transformers/models/marian/__init__.py delete mode 100644 tests/transformers/models/marian/test_modeling_marian.py delete mode 100644 tests/transformers/models/markuplm/__init__.py delete mode 100644 tests/transformers/models/markuplm/test_modeling_markuplm.py delete mode 100644 tests/transformers/models/mask2former/__init__.py delete mode 100644 tests/transformers/models/mask2former/test_image_processing_mask2former.py delete mode 100644 tests/transformers/models/mask2former/test_modeling_mask2former.py delete mode 100644 tests/transformers/models/maskformer/__init__.py delete mode 100644 tests/transformers/models/maskformer/test_modeling_maskformer.py delete mode 100644 tests/transformers/models/mbart/__init__.py delete mode 100644 tests/transformers/models/mbart/test_modeling_mbart.py delete mode 100644 tests/transformers/models/mbart50/__init__.py delete mode 100644 tests/transformers/models/mbart50/test_tokenization_mbart50.py delete mode 100644 tests/transformers/models/mctct/__init__.py delete mode 100644 tests/transformers/models/mctct/test_feature_extraction_mctct.py delete mode 100644 tests/transformers/models/mctct/test_modeling_mctct.py delete mode 100644 tests/transformers/models/mctct/test_processor_mctct.py delete mode 100644 tests/transformers/models/megatron_bert/__init__.py delete mode 100644 tests/transformers/models/megatron_bert/test_modeling_megatron_bert.py delete mode 100644 tests/transformers/models/mgp_str/__init__.py delete mode 100644 tests/transformers/models/mgp_str/test_modeling_mgp_str.py delete mode 100644 tests/transformers/models/mgp_str/test_processor_mgp_str.py delete mode 100644 
tests/transformers/models/mistral/__init__.py delete mode 100644 tests/transformers/models/mistral/test_modeling_mistral.py delete mode 100644 tests/transformers/models/mixtral/__init__.py delete mode 100644 tests/transformers/models/mixtral/test_modeling_mixtral.py delete mode 100644 tests/transformers/models/mllama/__init__.py delete mode 100644 tests/transformers/models/mllama/test_image_processing_mllama.py delete mode 100644 tests/transformers/models/mllama/test_modeling_mllama.py delete mode 100644 tests/transformers/models/mllama/test_processor_mllama.py delete mode 100644 tests/transformers/models/mluke/__init__.py delete mode 100644 tests/transformers/models/mluke/test_tokenization_mluke.py delete mode 100644 tests/transformers/models/mobilebert/__init__.py delete mode 100644 tests/transformers/models/mobilenet_v1/__init__.py delete mode 100644 tests/transformers/models/mobilenet_v1/test_image_processing_mobilenet_v1.py delete mode 100644 tests/transformers/models/mobilenet_v1/test_modeling_mobilenet_v1.py delete mode 100644 tests/transformers/models/mobilenet_v2/__init__.py delete mode 100644 tests/transformers/models/mobilenet_v2/test_image_processing_mobilenet_v2.py delete mode 100644 tests/transformers/models/mobilenet_v2/test_modeling_mobilenet_v2.py delete mode 100644 tests/transformers/models/mobilevit/__init__.py delete mode 100644 tests/transformers/models/mobilevit/test_image_processing_mobilevit.py delete mode 100644 tests/transformers/models/mobilevit/test_modeling_mobilevit.py delete mode 100644 tests/transformers/models/mobilevitv2/__init__.py delete mode 100644 tests/transformers/models/mobilevitv2/test_modeling_mobilevitv2.py delete mode 100644 tests/transformers/models/mpnet/__init__.py delete mode 100644 tests/transformers/models/mpnet/test_modeling_mpnet.py delete mode 100644 tests/transformers/models/mpt/__init__.py delete mode 100644 tests/transformers/models/mpt/test_modeling_mpt.py delete mode 100644 tests/transformers/models/mt5/__init__.py delete mode 100644 tests/transformers/models/mt5/test_modeling_mt5.py delete mode 100644 tests/transformers/models/musicgen/__init__.py delete mode 100644 tests/transformers/models/musicgen/test_modeling_musicgen.py delete mode 100644 tests/transformers/models/musicgen_melody/__init__.py delete mode 100644 tests/transformers/models/musicgen_melody/test_feature_extraction_musicgen_melody.py delete mode 100644 tests/transformers/models/musicgen_melody/test_modeling_musicgen_melody.py delete mode 100644 tests/transformers/models/musicgen_melody/test_processor_musicgen_melody.py delete mode 100644 tests/transformers/models/mvp/__init__.py delete mode 100644 tests/transformers/models/mvp/test_modeling_mvp.py delete mode 100644 tests/transformers/models/nllb/__init__.py delete mode 100644 tests/transformers/models/nllb/test_tokenization_nllb.py delete mode 100644 tests/transformers/models/nllb_moe/__init__.py delete mode 100644 tests/transformers/models/nllb_moe/test_modeling_nllb_moe.py delete mode 100644 tests/transformers/models/nougat/__init__.py delete mode 100644 tests/transformers/models/nougat/test_image_processing_nougat.py delete mode 100644 tests/transformers/models/nougat/test_modeling_nougat.py delete mode 100644 tests/transformers/models/nystromformer/__init__.py delete mode 100644 tests/transformers/models/nystromformer/test_modeling_nystromformer.py delete mode 100644 tests/transformers/models/olmo/__init__.py delete mode 100644 tests/transformers/models/olmo/test_modeling_olmo.py delete mode 100644 
tests/transformers/models/oneformer/__init__.py delete mode 100644 tests/transformers/models/oneformer/test_image_processing_oneformer.py delete mode 100644 tests/transformers/models/oneformer/test_modeling_oneformer.py delete mode 100644 tests/transformers/models/oneformer/test_processor_oneformer.py delete mode 100644 tests/transformers/models/openai/__init__.py delete mode 100644 tests/transformers/models/openai/test_modeling_gpt.py delete mode 100644 tests/transformers/models/opt/__init__.py delete mode 100644 tests/transformers/models/opt/test_modeling_opt.py delete mode 100644 tests/transformers/models/owlv2/__init__.py delete mode 100644 tests/transformers/models/owlv2/test_image_processor_owlv2.py delete mode 100644 tests/transformers/models/owlv2/test_modeling_owlv2.py delete mode 100644 tests/transformers/models/owlvit/__init__.py delete mode 100644 tests/transformers/models/owlvit/test_image_processing_owlvit.py delete mode 100644 tests/transformers/models/owlvit/test_modeling_owlvit.py delete mode 100644 tests/transformers/models/owlvit/test_processor_owlvit.py delete mode 100644 tests/transformers/models/patchtst/__init__.py delete mode 100644 tests/transformers/models/patchtst/test_modeling_patchtst.py delete mode 100644 tests/transformers/models/pegasus/__init__.py delete mode 100644 tests/transformers/models/pegasus/test_modeling_pegasus.py delete mode 100644 tests/transformers/models/pegasus_x/__init__.py delete mode 100644 tests/transformers/models/pegasus_x/test_modeling_pegasus_x.py delete mode 100644 tests/transformers/models/perceiver/__init__.py delete mode 100644 tests/transformers/models/perceiver/test_modeling_perceiver.py delete mode 100644 tests/transformers/models/perceiver/test_tokenization_perceiver.py delete mode 100644 tests/transformers/models/persimmon/__init__.py delete mode 100644 tests/transformers/models/persimmon/test_modeling_persimmon.py delete mode 100644 tests/transformers/models/phi/__init__.py delete mode 100644 tests/transformers/models/phi/test_modeling_phi.py delete mode 100644 tests/transformers/models/phi3/__init__.py delete mode 100644 tests/transformers/models/phi3/test_modeling_phi3.py delete mode 100644 tests/transformers/models/pix2struct/__init__.py delete mode 100644 tests/transformers/models/pix2struct/test_image_processing_pix2struct.py delete mode 100644 tests/transformers/models/pix2struct/test_modeling_pix2struct.py delete mode 100644 tests/transformers/models/pix2struct/test_processor_pix2struct.py delete mode 100644 tests/transformers/models/plbart/__init__.py delete mode 100644 tests/transformers/models/plbart/test_modeling_plbart.py delete mode 100644 tests/transformers/models/poolformer/__init__.py delete mode 100644 tests/transformers/models/poolformer/test_image_processing_poolformer.py delete mode 100644 tests/transformers/models/poolformer/test_modeling_poolformer.py delete mode 100644 tests/transformers/models/pop2piano/__init__.py delete mode 100644 tests/transformers/models/pop2piano/test_modeling_pop2piano.py delete mode 100644 tests/transformers/models/prophetnet/__init__.py delete mode 100644 tests/transformers/models/prophetnet/test_modeling_prophetnet.py delete mode 100644 tests/transformers/models/prophetnet/test_tokenization_prophetnet.py delete mode 100644 tests/transformers/models/qdqbert/__init__.py delete mode 100644 tests/transformers/models/qdqbert/test_modeling_qdqbert.py delete mode 100644 tests/transformers/models/qwen2/__init__.py delete mode 100644 
tests/transformers/models/qwen2/test_modeling_qwen2.py delete mode 100644 tests/transformers/models/qwen2_moe/__init__.py delete mode 100644 tests/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py delete mode 100644 tests/transformers/models/qwen2_vl/__init__.py delete mode 100644 tests/transformers/models/qwen2_vl/test_modeling_qwen2_vl.py delete mode 100644 tests/transformers/models/rag/__init__.py delete mode 100644 tests/transformers/models/rag/test_modeling_rag.py delete mode 100644 tests/transformers/models/rag/test_retrieval_rag.py delete mode 100644 tests/transformers/models/rag/test_tokenization_rag.py delete mode 100644 tests/transformers/models/realm/__init__.py delete mode 100644 tests/transformers/models/realm/test_modeling_realm.py delete mode 100644 tests/transformers/models/reformer/__init__.py delete mode 100644 tests/transformers/models/reformer/test_modeling_reformer.py delete mode 100644 tests/transformers/models/rembert/__init__.py delete mode 100644 tests/transformers/models/rembert/test_modeling_rembert.py delete mode 100644 tests/transformers/models/resnet/__init__.py delete mode 100644 tests/transformers/models/resnet/test_modeling_resnet.py delete mode 100644 tests/transformers/models/roberta/__init__.py delete mode 100644 tests/transformers/models/roberta/test_modeling_roberta.py delete mode 100644 tests/transformers/models/roberta_prelayernorm/__init__.py delete mode 100644 tests/transformers/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py delete mode 100644 tests/transformers/models/roc_bert/__init__.py delete mode 100644 tests/transformers/models/roc_bert/test_modeling_roc_bert.py delete mode 100644 tests/transformers/models/rwkv/__init__.py delete mode 100644 tests/transformers/models/rwkv/test_modeling_rwkv.py delete mode 100644 tests/transformers/models/sam/__init__.py delete mode 100644 tests/transformers/models/sam/test_modeling_sam.py delete mode 100644 tests/transformers/models/sam/test_processor_sam.py delete mode 100644 tests/transformers/models/seamless_m4t/__init__.py delete mode 100644 tests/transformers/models/seamless_m4t/test_modeling_seamless_m4t.py delete mode 100644 tests/transformers/models/seamless_m4t_v2/__init__.py delete mode 100644 tests/transformers/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py delete mode 100644 tests/transformers/models/segformer/__init__.py delete mode 100644 tests/transformers/models/segformer/test_modeling_segformer.py delete mode 100644 tests/transformers/models/seggpt/__init__.py delete mode 100644 tests/transformers/models/seggpt/test_image_processing_seggpt.py delete mode 100644 tests/transformers/models/seggpt/test_modeling_seggpt.py delete mode 100644 tests/transformers/models/sew/__init__.py delete mode 100644 tests/transformers/models/sew/test_modeling_sew.py delete mode 100644 tests/transformers/models/sew_d/__init__.py delete mode 100644 tests/transformers/models/sew_d/test_modeling_sew_d.py delete mode 100644 tests/transformers/models/speech_encoder_decoder/__init__.py delete mode 100644 tests/transformers/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py delete mode 100644 tests/transformers/models/speech_to_text/__init__.py delete mode 100644 tests/transformers/models/speech_to_text/test_modeling_speech_to_text.py delete mode 100644 tests/transformers/models/speech_to_text/test_tokenization_speech_to_text.py delete mode 100644 tests/transformers/models/speecht5/__init__.py delete mode 100644 
tests/transformers/models/speecht5/test_feature_extraction_speecht5.py delete mode 100644 tests/transformers/models/speecht5/test_modeling_speecht5.py delete mode 100644 tests/transformers/models/speecht5/test_processor_speecht5.py delete mode 100644 tests/transformers/models/speecht5/test_tokenization_speecht5.py delete mode 100644 tests/transformers/models/splinter/__init__.py delete mode 100644 tests/transformers/models/splinter/test_modeling_splinter.py delete mode 100644 tests/transformers/models/squeezebert/__init__.py delete mode 100644 tests/transformers/models/squeezebert/test_modeling_squeezebert.py delete mode 100644 tests/transformers/models/stablelm/__init__.py delete mode 100644 tests/transformers/models/stablelm/test_modeling_stablelm.py delete mode 100644 tests/transformers/models/starcoder2/__init__.py delete mode 100644 tests/transformers/models/starcoder2/test_modeling_starcoder2.py delete mode 100644 tests/transformers/models/superpoint/__init__.py delete mode 100644 tests/transformers/models/superpoint/test_image_processing_superpoint.py delete mode 100644 tests/transformers/models/superpoint/test_modeling_superpoint.py delete mode 100644 tests/transformers/models/swiftformer/__init__.py delete mode 100644 tests/transformers/models/swiftformer/test_modeling_swiftformer.py delete mode 100644 tests/transformers/models/swin/__init__.py delete mode 100644 tests/transformers/models/swin/test_modeling_swin.py delete mode 100644 tests/transformers/models/swin2sr/__init__.py delete mode 100644 tests/transformers/models/swin2sr/test_image_processing_swin2sr.py delete mode 100644 tests/transformers/models/swin2sr/test_modeling_swin2sr.py delete mode 100644 tests/transformers/models/switch_transformers/__init__.py delete mode 100644 tests/transformers/models/switch_transformers/test_modeling_switch_transformers.py delete mode 100644 tests/transformers/models/t5/__init__.py delete mode 100644 tests/transformers/models/t5/test_modeling_t5.py delete mode 100644 tests/transformers/models/tapas/__init__.py delete mode 100644 tests/transformers/models/tapas/test_modeling_tapas.py delete mode 100644 tests/transformers/models/tapas/test_tokenization_tapas.py delete mode 100644 tests/transformers/models/tapex/__init__.py delete mode 100644 tests/transformers/models/tapex/test_tokenization_tapex.py delete mode 100644 tests/transformers/models/time_series_transformer/__init__.py delete mode 100644 tests/transformers/models/time_series_transformer/test_modeling_time_series_transformer.py delete mode 100644 tests/transformers/models/timesformer/__init__.py delete mode 100644 tests/transformers/models/timesformer/test_modeling_timesformer.py delete mode 100644 tests/transformers/models/tinybert/__init__.py delete mode 100644 tests/transformers/models/trocr/__init__.py delete mode 100644 tests/transformers/models/trocr/test_modeling_trocr.py delete mode 100644 tests/transformers/models/tvlt/__init__.py delete mode 100644 tests/transformers/models/tvlt/test_modeling_tvlt.py delete mode 100644 tests/transformers/models/udop/__init__.py delete mode 100644 tests/transformers/models/udop/test_modeling_udop.py delete mode 100644 tests/transformers/models/udop/test_processor_udop.py delete mode 100644 tests/transformers/models/udop/test_tokenization_udop.py delete mode 100644 tests/transformers/models/umt5/__init__.py delete mode 100644 tests/transformers/models/umt5/test_modeling_umt5.py delete mode 100644 tests/transformers/models/unispeech/__init__.py delete mode 100644 
tests/transformers/models/unispeech/test_modeling_unispeech.py delete mode 100644 tests/transformers/models/unispeech_sat/__init__.py delete mode 100644 tests/transformers/models/unispeech_sat/test_modeling_unispeech_sat.py delete mode 100644 tests/transformers/models/univnet/__init__.py delete mode 100644 tests/transformers/models/univnet/test_feature_extraction_univnet.py delete mode 100644 tests/transformers/models/univnet/test_modeling_univnet.py delete mode 100644 tests/transformers/models/upernet/__init__.py delete mode 100644 tests/transformers/models/upernet/test_modeling_upernet.py delete mode 100644 tests/transformers/models/videomae/__init__.py delete mode 100644 tests/transformers/models/videomae/test_image_processing_videomae.py delete mode 100644 tests/transformers/models/videomae/test_modeling_videomae.py delete mode 100644 tests/transformers/models/vilt/__init__.py delete mode 100644 tests/transformers/models/vilt/test_image_processing_vilt.py delete mode 100644 tests/transformers/models/vilt/test_modeling_vilt.py delete mode 100644 tests/transformers/models/vipllava/__init__.py delete mode 100644 tests/transformers/models/vipllava/test_modeling_vipllava.py delete mode 100644 tests/transformers/models/vision_encoder_decoder/__init__.py delete mode 100644 tests/transformers/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py delete mode 100644 tests/transformers/models/vision_text_dual_encoder/__init__.py delete mode 100644 tests/transformers/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py delete mode 100644 tests/transformers/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py delete mode 100644 tests/transformers/models/visual_bert/__init__.py delete mode 100644 tests/transformers/models/visual_bert/test_modeling_visual_bert.py delete mode 100644 tests/transformers/models/vit/__init__.py delete mode 100644 tests/transformers/models/vit/test_image_processing_vit.py delete mode 100644 tests/transformers/models/vit/test_modeling_vit.py delete mode 100644 tests/transformers/models/vit_hybrid/__init__.py delete mode 100644 tests/transformers/models/vit_hybrid/test_modeling_vit_hybrid.py delete mode 100644 tests/transformers/models/vit_mae/__init__.py delete mode 100644 tests/transformers/models/vit_mae/test_modeling_vit_mae.py delete mode 100644 tests/transformers/models/vit_msn/__init__.py delete mode 100644 tests/transformers/models/vit_msn/test_modeling_vit_msn.py delete mode 100644 tests/transformers/models/vitdet/__init__.py delete mode 100644 tests/transformers/models/vitdet/test_modeling_vitdet.py delete mode 100644 tests/transformers/models/vitmatte/__init__.py delete mode 100644 tests/transformers/models/vitmatte/test_image_processing_vitmatte.py delete mode 100644 tests/transformers/models/vitmatte/test_modeling_vitmatte.py delete mode 100644 tests/transformers/models/vits/__init__.py delete mode 100644 tests/transformers/models/vits/test_modeling_vits.py delete mode 100644 tests/transformers/models/vivit/__init__.py delete mode 100644 tests/transformers/models/vivit/test_image_processing_vit.py delete mode 100644 tests/transformers/models/vivit/test_modeling_vivit.py delete mode 100644 tests/transformers/models/wav2vec2/__init__.py delete mode 100644 tests/transformers/models/wav2vec2/test_feature_extraction_wav2vec2.py delete mode 100644 tests/transformers/models/wav2vec2/test_modeling_wav2vec2.py delete mode 100644 tests/transformers/models/wav2vec2/test_processor_wav2vec2.py delete mode 100644 
tests/transformers/models/wav2vec2/test_tokenization_wav2vec2.py delete mode 100644 tests/transformers/models/wav2vec2_bert/__init__.py delete mode 100644 tests/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py delete mode 100644 tests/transformers/models/wav2vec2_bert/test_processor_wav2vec2_bert.py delete mode 100644 tests/transformers/models/wav2vec2_conformer/__init__.py delete mode 100644 tests/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py delete mode 100644 tests/transformers/models/wavlm/__init__.py delete mode 100644 tests/transformers/models/wavlm/test_modeling_wavlm.py delete mode 100644 tests/transformers/models/whisper/__init__.py delete mode 100644 tests/transformers/models/whisper/test_modeling_whisper.py delete mode 100644 tests/transformers/models/x_clip/__init__.py delete mode 100644 tests/transformers/models/x_clip/test_modeling_x_clip.py delete mode 100644 tests/transformers/models/xlm/__init__.py delete mode 100644 tests/transformers/models/xlm/test_modeling_xlm.py delete mode 100644 tests/transformers/models/xlm/test_tokenization_xlm.py delete mode 100644 tests/transformers/models/xlm_prophetnet/__init__.py delete mode 100644 tests/transformers/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py delete mode 100644 tests/transformers/models/xlm_roberta/__init__.py delete mode 100644 tests/transformers/models/xlm_roberta/test_modeling_xlm_roberta.py delete mode 100644 tests/transformers/models/xlm_roberta_xl/__init__.py delete mode 100644 tests/transformers/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py delete mode 100644 tests/transformers/models/xlnet/__init__.py delete mode 100644 tests/transformers/models/xlnet/test_modeling_xlnet.py delete mode 100644 tests/transformers/models/xlnet/test_tokenization_xlnet.py delete mode 100644 tests/transformers/models/xmod/__init__.py delete mode 100644 tests/transformers/models/xmod/test_modeling_xmod.py delete mode 100644 tests/transformers/models/yolos/__init__.py delete mode 100644 tests/transformers/models/yolos/test_image_processing_yolos.py delete mode 100644 tests/transformers/models/yolos/test_modeling_yolos.py delete mode 100644 tests/transformers/pipelines/__init__.py delete mode 100644 tests/transformers/pipelines/test_pipelines_audio_classification.py delete mode 100644 tests/transformers/pipelines/test_pipelines_automatic_speech_recognition.py delete mode 100644 tests/transformers/pipelines/test_pipelines_common.py delete mode 100644 tests/transformers/pipelines/test_pipelines_depth_estimation.py delete mode 100644 tests/transformers/pipelines/test_pipelines_document_question_answering.py delete mode 100644 tests/transformers/pipelines/test_pipelines_feature_extraction.py delete mode 100644 tests/transformers/pipelines/test_pipelines_fill_mask.py delete mode 100644 tests/transformers/pipelines/test_pipelines_image_classification.py delete mode 100644 tests/transformers/pipelines/test_pipelines_image_feature_extraction.py delete mode 100644 tests/transformers/pipelines/test_pipelines_image_segmentation.py delete mode 100644 tests/transformers/pipelines/test_pipelines_question_answering.py delete mode 100644 tests/transformers/pipelines/test_pipelines_table_question_answering.py delete mode 100644 tests/transformers/pipelines/test_pipelines_text2text_generation.py delete mode 100644 tests/transformers/pipelines/test_pipelines_text_classification.py delete mode 100644 tests/transformers/pipelines/test_pipelines_text_generation.py delete mode 100644 
tests/transformers/pipelines/test_pipelines_zero_shot_classification.py delete mode 100644 tests/transformers/test_backbone_common.py delete mode 100644 tests/transformers/test_configuration_common.py delete mode 100644 tests/transformers/test_feature_extraction_common.py delete mode 100644 tests/transformers/test_image_processing_common.py delete mode 100644 tests/transformers/test_modeling_common.py delete mode 100644 tests/transformers/test_pipeline_mixin.py delete mode 100644 tests/transformers/test_sequence_feature_extraction_common.py delete mode 100644 tests/transformers/test_tokenization_common.py diff --git a/.github/workflows/ci_pipeline.yaml b/.github/workflows/ci_pipeline.yaml index 2daa959a3..341fe2f14 100644 --- a/.github/workflows/ci_pipeline.yaml +++ b/.github/workflows/ci_pipeline.yaml @@ -132,7 +132,9 @@ jobs: pip install mindspore - name: Test with pytest run: | - pytest -vs tests/transformers/models/${{ matrix.alpha }}*/test_modeling* + pip install transformers==4.51.2 + git clone -b 4.51.2 https://gitee.com/mirrors/huggingface_transformers + python tests/run_test.py -vs huggingface_transformers/tests/models/${{ matrix.alpha }}*/test_modeling* kaggle-gpu-test: needs: pylint-check diff --git a/.gitignore b/.gitignore index d6055289a..ceb763309 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,6 @@ aclinit.json xiyouji.txt *.safetensors *.jit -flagged/ \ No newline at end of file +flagged/ + +huggingface_transformers/ \ No newline at end of file diff --git a/mindnlp/__init__.py b/mindnlp/__init__.py index 27e734e48..135c44510 100644 --- a/mindnlp/__init__.py +++ b/mindnlp/__init__.py @@ -17,7 +17,6 @@ MindNLP library. """ import os -import sys import platform from packaging import version @@ -51,6 +50,3 @@ initialize_torch_proxy() setup_metadata_patch() setup_safetensors_patch() - -from . import core -from . import transformers diff --git a/mindnlp/core/__init__.py b/mindnlp/core/__init__.py index 1a5ce8335..a4b91704a 100644 --- a/mindnlp/core/__init__.py +++ b/mindnlp/core/__init__.py @@ -36,7 +36,7 @@ from ._C import * from ._dtype import * from ._tensor import Tensor, tensor, is_tensor, \ - LongTensor, FloatTensor, BoolTensor, HalfTensor, BFloat16Tensor + LongTensor, FloatTensor, BoolTensor, HalfTensor, BFloat16Tensor, IntTensor from .types import device from ._C.size import Size from .types import device @@ -46,9 +46,11 @@ from ._bind import get_default_dtype, set_default_dtype from . import profiler, cuda, optim, amp, compiler, jit, version, __future__, overrides, \ - return_types, linalg + return_types, linalg, fx from ._lowrank import svd_lowrank +from .random import get_rng_state, initial_seed, manual_seed, seed, set_rng_state + def _has_compatible_shallow_copy_type(tensor, other): """ @@ -70,4 +72,11 @@ def _has_compatible_shallow_copy_type(tensor, other): return False # Compatibility confirmed - return True \ No newline at end of file + return True + +def compile(fn=None, *args, **kwargs): + def wrap_func(fn): + return fn + if fn is not None: + return wrap_func(fn) + return wrap_func \ No newline at end of file diff --git a/mindnlp/core/_tensor.py b/mindnlp/core/_tensor.py index ef287c4ba..2722ea67c 100644 --- a/mindnlp/core/_tensor.py +++ b/mindnlp/core/_tensor.py @@ -9,11 +9,17 @@ except: class StubTensor: pass +try: + from mindspore._c_expression import TensorPy as Tensor_ +except: + from mindspore._c_expression import Tensor as Tensor_ + from . 
import ops, _dtype from ._dtype import dtype2np from ._bind import get_default_device, device_ from .configs import use_pyboost, ON_A1 from .storage import UntypedStorage +from ._utils import _rebuild_tensor_v2 DTYPE_ELEMENT_SIZE_MAP = { mindspore.float64: 8, @@ -31,6 +37,11 @@ def __isinstancecheck__(self, instance): return False return instance.dtype == self.dtype +class IntTensor(Tensor, metaclass=TypedTensorMeta): + dtype = _dtype.int + def __init__(self, data, device=None): + super().__init__(data, dtype=_dtype.int) + class LongTensor(Tensor, metaclass=TypedTensorMeta): dtype = _dtype.long def __init__(self, data, device=None): @@ -77,6 +88,22 @@ def is_tensor(x): return isinstance(x, Tensor) def enable_mindspore_patch(): + def __reduce_ex__(self, protocol): + if isinstance(self, StubTensor): + data = Tensor_(self.stub_sync()) + else: + data = Tensor_(self) + storage_offset = 0 + size = data._shape + stride = data.stride() + requires_grad = False + args = (data, storage_offset, size, stride, requires_grad, None, None) + return ( + _rebuild_from_type_v2, (_rebuild_tensor_v2, type(self), args, None)) + + Tensor.__reduce_ex__ = __reduce_ex__ + StubTensor.__reduce_ex__ = __reduce_ex__ + def to_(self, *args, **kwargs): dtype_to = None if len(args) == 1: @@ -260,3 +287,29 @@ def unfold(self, dimension, size, step): Tensor.unfold = unfold StubTensor.unfold = unfold + + def new(self, data=None): + if data is None: + return Tensor([], dtype=self.dtype) + return Tensor(data, dtype=self.dtype) + + Tensor.new = new + StubTensor.new = new + + def view(self, *args): + if isinstance(args[0], (tuple, list)): + args = args[0] + return self.reshape(*args) + + Tensor.view = view + StubTensor.view = view + + def cpu(self): + return self + + Tensor.cpu = cpu + StubTensor.cpu = cpu + +def _rebuild_from_type_v2(func, new_type, args, state): + ret = func(*args) + return ret \ No newline at end of file diff --git a/mindnlp/core/_utils.py b/mindnlp/core/_utils.py index c2e4a912d..e193fb8fc 100644 --- a/mindnlp/core/_utils.py +++ b/mindnlp/core/_utils.py @@ -1,8 +1,16 @@ import sys import traceback +from functools import reduce +import operator - +import numpy as np from mindnlp import core +from .configs import SUPPORT_BF16 + +if SUPPORT_BF16: + from mindspore.common.np_dtype import bfloat16 # pylint: disable=import-error +else: + from ml_dtypes import bfloat16 element_size_map = { core.float16: 2, @@ -62,16 +70,47 @@ def _unflatten_dense_tensors(flat, tensors): offset += numel return outputs -def _rebuild_tensor_v2( - storage, - storage_offset, - size, - stride, - requires_grad, - backward_hooks, - metadata=None, -): - return core.Tensor(storage) +def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks=None, metadata=None): + '''Rebuilds a tensor based on the provided parameters. + + Args: + storage (ndarray): The storage array from which the tensor is created. + storage_offset (int): The offset in the storage array from where the tensor data starts. + size (tuple): The size of the tensor. + stride (tuple or None): The stride of the tensor, or None if not applicable. + requires_grad (bool): Indicates if the tensor requires gradient computation. + backward_hooks (list): A list of backward hooks for the tensor. + metadata (Any, optional): Additional metadata associated with the tensor. + + Returns: + None: This function does not have a return value. + + Raises: + None: This function does not raise any exceptions. 
+ ''' + if size == (): + num_elemets = 1 + else: + num_elemets = reduce(operator.mul, size) + array = storage[storage_offset: storage_offset + num_elemets] + + if array.dtype == bfloat16 and not SUPPORT_BF16: + array = array.astype(np.float16) + + if stride is not None and len(stride) > 1 and stride[0] == 1: + # stride = tuple((s * 4 for s in stride)) + # # stride = tuple((s * 4 if s != 1 else s for s in stride)) + # array = np.lib.stride_tricks.as_strided(array, size, stride) + order = "F" + array = array.reshape(size, order=order) + else: + order = "C" + array = array.reshape(size, order=order) + + if isinstance(array, np.memmap): + array = array.copy() + param = core.from_numpy(array) + return param class KeyErrorMessage(str): r"""str subclass that returns itself in repr""" diff --git a/mindnlp/core/distributed/fsdp/__init__.py b/mindnlp/core/distributed/fsdp/__init__.py index 3b6767333..3ab67e3c0 100644 --- a/mindnlp/core/distributed/fsdp/__init__.py +++ b/mindnlp/core/distributed/fsdp/__init__.py @@ -1,2 +1 @@ -class FullyShardedDataParallel: - pass +from .fully_sharded_data_parallel import FullyShardedDataParallel \ No newline at end of file diff --git a/mindnlp/core/distributed/fsdp/fully_sharded_data_parallel.py b/mindnlp/core/distributed/fsdp/fully_sharded_data_parallel.py new file mode 100644 index 000000000..a1f4b4188 --- /dev/null +++ b/mindnlp/core/distributed/fsdp/fully_sharded_data_parallel.py @@ -0,0 +1,2 @@ +class FullyShardedDataParallel: + pass diff --git a/mindnlp/core/fx/__init__.py b/mindnlp/core/fx/__init__.py index ce027f4c0..118893033 100644 --- a/mindnlp/core/fx/__init__.py +++ b/mindnlp/core/fx/__init__.py @@ -9,3 +9,8 @@ from .proxy import Proxy from . import _pytree + +class Graph: pass +class GraphModule: pass +class Node: pass +class Tracer: pass \ No newline at end of file diff --git a/tests/peft/__init__.py b/mindnlp/core/fx/_compatibility.py similarity index 100% rename from tests/peft/__init__.py rename to mindnlp/core/fx/_compatibility.py diff --git a/mindnlp/core/nn/modules/module.py b/mindnlp/core/nn/modules/module.py index c259a903e..9a18c7e02 100644 --- a/mindnlp/core/nn/modules/module.py +++ b/mindnlp/core/nn/modules/module.py @@ -15,7 +15,7 @@ class StubTensor: pass from mindnlp import core from mindnlp.core import device, dtype, Tensor -from ..parameter import Parameter +from ..parameter import Parameter, Buffer from ...utils import hooks from ...utils.hooks import RemovableHandle @@ -544,22 +544,38 @@ def register_parameter(self, name: str, param: Optional[Parameter]) -> None: param = output self._parameters[name] = param - def add_module(self, name, module): - """Adds a child module to the current module. + def add_module(self, name: str, module: Optional["Module"]) -> None: + r"""Add a child module to the current module. The module can be accessed as an attribute using the given name. Args: - name (string): name of the child module. The child module can be + name (str): name of the child module. The child module can be accessed from this module using the given name - parameter (Module): child module to be added to the module. + module (Module): child module to be added to the module. 
""" if not isinstance(module, Module) and module is not None: - raise TypeError("{} is not a Module subclass".format(type(module))) - if hasattr(self, name) and name not in self._modules: - raise KeyError("attribute '{}' already exists".format(name)) + raise TypeError(f"{torch.typename(module)} is not a Module subclass") + elif not isinstance(name, str): + raise TypeError( + f"module name should be a string. Got {torch.typename(name)}" + ) + elif hasattr(self, name) and name not in self._modules: + raise KeyError(f"attribute '{name}' already exists") + elif "." in name: + raise KeyError(f'module name can\'t contain ".", got: {name}') + elif name == "": + raise KeyError('module name can\'t be empty string ""') + for hook in _global_module_registration_hooks.values(): + output = hook(self, name, module) + if output is not None: + module = output self._modules[name] = module + def register_module(self, name: str, module: Optional["Module"]) -> None: + r"""Alias for :func:`add_module`.""" + self.add_module(name, module) + def get_parameter(self, target: str) -> "Parameter": """Return the parameter given by ``target`` if it exists, otherwise throw an error. @@ -984,7 +1000,7 @@ def __getattr__(self, name): return modules[name] raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") - def __setattr__(self, name: str, value: Union[Tensor, 'Module']) -> None: + def __setattr__(self, name: str, value: Union[Tensor, "Module"]) -> None: def remove_from(*dicts_or_sets): for d in dicts_or_sets: if name in d: @@ -993,54 +1009,97 @@ def remove_from(*dicts_or_sets): else: d.discard(name) - params = self.__dict__.get('_parameters') - + params = self.__dict__.get("_parameters") if isinstance(value, Parameter): if params is None: raise AttributeError( - "cannot assign parameters before Module.__init__() call") - remove_from(self.__dict__, self._buffers, self._modules, self._non_persistent_buffers_set) + "cannot assign parameters before Module.__init__() call" + ) + remove_from( + self.__dict__, + self._buffers, + self._modules, + self._non_persistent_buffers_set, + ) self.register_parameter(name, value) elif params is not None and name in params: if value is not None: - raise TypeError(f"cannot assign '{type(value)}' as parameter '{name}' " - "(core.nn.Parameter or None expected)" - ) + raise TypeError( + f"cannot assign '{core.typename(value)}' as parameter '{name}' " + "(torch.nn.Parameter or None expected)" + ) self.register_parameter(name, value) else: - modules = self.__dict__.get('_modules') - if isinstance(value, Module): + modules = self.__dict__.get("_modules") + if isinstance(value, core.nn.Module): if modules is None: raise AttributeError( - "cannot assign module before Module.__init__() call") - remove_from(self.__dict__, self._parameters, self._buffers, self._non_persistent_buffers_set) + "cannot assign module before Module.__init__() call" + ) + remove_from( + self.__dict__, + self._parameters, + self._buffers, + self._non_persistent_buffers_set, + ) for hook in _global_module_registration_hooks.values(): output = hook(self, name, value) if output is not None: value = output modules[name] = value + elif modules is not None and name in modules: if value is not None: - raise TypeError(f"cannot assign '{type(value)}' as child module '{name}' " - "(nn.Module or None expected)" - ) + raise TypeError( + f"cannot assign '{core.typename(value)}' as child module '{name}' " + "(torch.nn.Module or None expected)" + ) for hook in _global_module_registration_hooks.values(): output 
= hook(self, name, value) if output is not None: value = output modules[name] = value else: - buffers = self.__dict__.get('_buffers') - if buffers is not None and name in buffers: - if value is not None and not isinstance(value, Tensor): - raise TypeError(f"cannot assign '{type(value)}' as buffer '{name}' " - "(core.Tensor or None expected)" - ) - for hook in _global_buffer_registration_hooks.values(): - output = hook(self, name, value) - if output is not None: - value = output - buffers[name] = value + buffers = self.__dict__.get("_buffers") + if isinstance(value, Buffer) or buffers is not None and name in buffers: + if value is not None and not isinstance(value, core.Tensor): + raise TypeError( + f"cannot assign '{core.typename(value)}' as buffer '{name}' " + "(torch.nn.Buffer, torch.Tensor or None expected)" + ) + if isinstance(value, Buffer): + persistent = value.persistent + else: + persistent = name not in self._non_persistent_buffers_set + # === HACK === + # This whole block below should just be: + # self.register_buffer(name, value, persistent) + + # But to support subclasses of nn.Module that (wrongfully) implement a + # register_buffer() method that doesn't have the "persistent" + # argument. Only pass it in if it is accepted otherwise assume + # it is always true + if ( + getattr(self.register_buffer, "__func__", None) + is Module.register_buffer + ): + self.register_buffer(name, value, persistent) + else: + sign = inspect.signature(self.register_buffer) + if "persistent" in sign.parameters: + self.register_buffer(name, value, persistent) + else: + if not persistent: + raise RuntimeError( + "Registering a non-persistent buffer " + "on a Module subclass that implements " + "register_buffer() without the persistent " + "argument is not allowed." + ) + # Assume that the implementation without the argument has the + # behavior from before the argument was added: persistent=True + self.register_buffer(name, value) + # === HACK END === else: super().__setattr__(name, value) @@ -1394,10 +1453,16 @@ def load(module, local_state_dict, prefix=''): return _IncompatibleKeys(missing_keys, unexpected_keys) - def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate: bool = True): + def _named_members( + self, get_members_fn, prefix="", recurse=True, remove_duplicate: bool = True + ): r"""Help yield various names + members of modules.""" memo = set() - modules = self.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) if recurse else [(prefix, self)] + modules = ( + self.named_modules(prefix=prefix, remove_duplicate=remove_duplicate) + if recurse + else [(prefix, self)] + ) for module_prefix, module in modules: members = get_members_fn(module) for k, v in members: @@ -1405,7 +1470,7 @@ def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplica continue if remove_duplicate: memo.add(v) - name = module_prefix + ('.' if module_prefix else '') + k + name = module_prefix + ("." if module_prefix else "") + k yield name, v def parameters(self, recurse: bool = True) -> Iterator[Parameter]: @@ -1500,7 +1565,7 @@ def get_submodule(self, target: str) -> "Module": mod = getattr(mod, item) - if not isinstance(mod, Module): + if not isinstance(mod, core.nn.Module): raise AttributeError("`" + item + "` is not " "an nn.Module") @@ -1697,9 +1762,6 @@ def named_modules(self, memo: Optional[Set['Module']] = None, prefix: str = '', submodule_prefix = prefix + ('.' 
if prefix else '') + name yield from module.named_modules(memo, submodule_prefix, remove_duplicate) - def cells_and_names(self, cells=None, name_prefix=''): - return self.named_modules(cells, name_prefix) - def jit(self, mode=True): self.__ms_class__ = mode for module in self.children(): diff --git a/mindnlp/core/nn/parameter.py b/mindnlp/core/nn/parameter.py index 1c89e1954..438098fcd 100644 --- a/mindnlp/core/nn/parameter.py +++ b/mindnlp/core/nn/parameter.py @@ -80,3 +80,30 @@ def __init__(self, input_data=None, requires_grad=True): def is_lazy(param): return False + + +class Buffer(Tensor): + r"""A kind of Tensor that should not be considered a model + parameter. For example, BatchNorm's ``running_mean`` is not a parameter, but is part of the module's state. + + Buffers are :class:`~torch.Tensor` subclasses, that have a + very special property when used with :class:`Module` s -- when they're + assigned as Module attributes they are automatically added to the list of + its buffers, and will appear e.g. in :meth:`~torch.nn.Module.buffers` iterator. + Assigning a Tensor doesn't have such effect. One can still assign a Tensor as explicitly by using + the :meth:`~torch.nn.Module.register_buffer` function. + + Args: + data (Tensor): buffer tensor. + persistent (bool, optional): whether the buffer is part of the module's + :attr:`state_dict`. Default: ``True`` + """ + + def __new__(cls, data=None, *, persistent=True): + if data is None: + data = core.empty(0) + + t = data.detach().requires_grad_(data.requires_grad) + t.persistent = persistent + t._is_buffer = True + return t \ No newline at end of file diff --git a/mindnlp/core/serialization.py b/mindnlp/core/serialization.py index 049a81e42..54529129c 100644 --- a/mindnlp/core/serialization.py +++ b/mindnlp/core/serialization.py @@ -41,7 +41,11 @@ import numpy as np import mindspore -from mindspore._c_expression import Tensor as MSTensor +try: + from mindspore._c_expression import TensorPy as Tensor_ +except: + from mindspore._c_expression import Tensor as Tensor_ + from mindspore.train.serialization import _exec_save, _parse_ckpt_proto, tensor_to_np_type, tensor_to_ms_type import safetensors @@ -774,45 +778,6 @@ def _rebuild_parameter(data, requires_grad, backward_hooks): # OrderedDict. See Note [Don't serialize hooks] return param -def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None): - '''Rebuilds a tensor based on the provided parameters. - - Args: - storage (ndarray): The storage array from which the tensor is created. - storage_offset (int): The offset in the storage array from where the tensor data starts. - size (tuple): The size of the tensor. - stride (tuple or None): The stride of the tensor, or None if not applicable. - requires_grad (bool): Indicates if the tensor requires gradient computation. - backward_hooks (list): A list of backward hooks for the tensor. - metadata (Any, optional): Additional metadata associated with the tensor. - - Returns: - None: This function does not have a return value. - - Raises: - None: This function does not raise any exceptions. 
- ''' - if size == (): - num_elemets = 1 - else: - num_elemets = reduce(operator.mul, size) - array = storage[storage_offset: storage_offset + num_elemets] - - if array.dtype == bfloat16 and not SUPPORT_BF16: - array = array.astype(np.float16) - - if stride is not None and len(stride) > 1 and stride[0] == 1: - # stride = tuple((s * 4 for s in stride)) - # # stride = tuple((s * 4 if s != 1 else s for s in stride)) - # array = np.lib.stride_tricks.as_strided(array, size, stride) - order = "F" - array = array.reshape(size, order=order) - else: - order = "C" - array = array.reshape(size, order=order) - param = core.from_numpy(array) - return param - def _rebuild_from_type_v2(func, new_type, args, state): ret = func(*args) return ret @@ -1038,7 +1003,7 @@ class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined] def find_class(self, mod_name, name): if name == '_rebuild_tensor_v2': name = '_rebuild_tensor_legacy' - if mod_name == 'core._utils': + if mod_name == 'torch._utils': return eval(name) if mod_name == 'torch': return str(name) @@ -1224,7 +1189,7 @@ def persistent_load(saved_id): load_module_mapping: Dict[str, str] = { # See https://github.com/pytorch/pytorch/pull/51633 - 'core.tensor': 'core._tensor' + 'torch.tensor': 'torch._tensor' } # Need to subclass Unpickler instead of directly monkey-patching the find_class method @@ -1235,12 +1200,8 @@ class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined] # Lets us override the imports that pickle uses when unpickling an object. # This is useful for maintaining BC if we change a module path that tensor instantiation relies on. def find_class(self, mod_name, name): - if mod_name == 'torch._utils': - return eval(name) if mod_name == 'torch': return str(name) - if mod_name == 'torch._tensor': - return eval(name) mod_name = load_module_mapping.get(mod_name, mod_name) return super().find_class(mod_name, name) @@ -1306,7 +1267,8 @@ def _check_save_filelike(f): "expected 'f' to be string, path, or a file-like object with " "a 'write' attribute") -def save(obj, f, pickle_module = pickle, pickle_protocol = 2, _disable_byteorder_record: bool = False): +def save(obj, f, pickle_module = pickle, pickle_protocol = 2, _disable_byteorder_record: bool = False, + _use_new_zipfile_serialization=False): _check_save_filelike(f) with _open_zipfile_writer(f) as opened_zipfile: _save( @@ -1341,7 +1303,7 @@ def persistent_id(obj): # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 - if isinstance(obj, MSTensor): + if isinstance(obj, Tensor_) and not isinstance(obj, mindspore.Tensor): storage_type = storage_map[obj.dtype] storage_numel = obj._size storage_key = id_map.setdefault(id(obj), str(len(id_map))) diff --git a/mindnlp/transformers/__init__.py b/mindnlp/transformers/__init__.py index a5b1c4060..c4c6fc82d 100644 --- a/mindnlp/transformers/__init__.py +++ b/mindnlp/transformers/__init__.py @@ -4111,3 +4111,4 @@ extra_objects={"__version__": transformers.__version__}, ) +transformers.utils.import_utils._torch_fx_available = False \ No newline at end of file diff --git a/mindnlp/utils/safetensors_patch.py b/mindnlp/utils/safetensors_patch.py index 2d32c56e5..cc610521c 100644 --- a/mindnlp/utils/safetensors_patch.py +++ b/mindnlp/utils/safetensors_patch.py @@ -99,7 +99,6 @@ def start_offset(self): return self.base_ptr + self.info["data_offsets"][0] def get_shape(self): - print('get_shape', self.shape) return 
self.shape def get_dtype(self): diff --git a/mindnlp/utils/torch_proxy.py b/mindnlp/utils/torch_proxy.py index 7309b4d62..cca325d9e 100644 --- a/mindnlp/utils/torch_proxy.py +++ b/mindnlp/utils/torch_proxy.py @@ -57,7 +57,7 @@ def initialize_torch_proxy(): sys.modules["torch"] = torch_proxy # Set the required metadata - torch_proxy.__version__ = "2.1.1" + torch_proxy.__version__ = "2.1.1+dev" return torch_proxy @@ -71,9 +71,9 @@ def setup_metadata_patch(): def patched_distribution(dist_name): if dist_name == "torch": return types.SimpleNamespace( - version="2.1.1", - metadata={"Name": "torch", "Version": "2.1.1"}, - read_text=lambda f: f"Name: torch\nVersion: 2.1.1" if f == "METADATA" else None + version="2.1.1+dev", + metadata={"Name": "torch", "Version": "2.1.1+dev"}, + read_text=lambda f: f"Name: torch\nVersion: 2.1.1+dev" if f == "METADATA" else None ) return orig_distribution(dist_name) @@ -82,8 +82,8 @@ def patched_distributions(**kwargs): dists = list(orig_distributions(**kwargs)) dists.append(types.SimpleNamespace( name="torch", - version="2.1.1", - metadata={"Name": "torch", "Version": "2.1.1"}, + version="2.1.1+dev", + metadata={"Name": "torch", "Version": "2.1.1+dev"}, files=[], locate_file=lambda p: None, _normalized_name='torch', diff --git a/tests/peft/test_config.py b/tests/peft/test_config.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/run_test.py b/tests/run_test.py new file mode 100644 index 000000000..6e2ae52f3 --- /dev/null +++ b/tests/run_test.py @@ -0,0 +1,39 @@ +import sys +import pytest +import mindspore +import mindnlp +from mindnlp import transformers + +mindspore.set_context(pynative_synchronize=True) + +def run_tests(): + """ + Run the tests with pytest.main(), supporting all pytest command-line arguments. + Usage: python run_test.py [pytest args] [test paths] + Examples: + python run_test.py -v tests/ + python run_test.py -k "login" tests/test_auth.py + python run_test.py tests/test_api.py::TestLogin::test_invalid_credentials + """ + # Collect the command-line arguments (excluding the script name itself) + pytest_args = sys.argv[1:] + + if not pytest_args: + print("No arguments provided; running all tests under the current directory by default") + print("Example usage: python run_test.py -v tests/") + + # Run the tests and capture the exit code + exit_code = pytest.main(pytest_args) + + # Report the result based on the exit code + if exit_code == 0: + print("\n✅ All tests passed!") + else: + print(f"\n❌ Tests failed, exit code: {exit_code}") + print("Common exit codes: 0=passed, 1=tests failed, 2=interrupted, 3=internal error, 4=usage error") + + return exit_code + +if __name__ == "__main__": + # Run the tests and return the system exit code + sys.exit(run_tests()) \ No newline at end of file diff --git a/tests/transformers/__init__.py b/tests/transformers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/generation/__init__.py b/tests/transformers/generation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/generation/test_framework_agnostic.py b/tests/transformers/generation/test_framework_agnostic.py deleted file mode 100644 index bcc3de7f0..000000000 --- a/tests/transformers/generation/test_framework_agnostic.py +++ /dev/null @@ -1,605 +0,0 @@ -# pylint: disable=missing-function-docstring -# pylint: disable=not-callable -""" -Framework agnostic tests for generate()-related methods.
-""" - -import numpy as np - -from mindnlp.transformers import AutoTokenizer -from mindnlp.utils.testing_utils import slow - - -class GenerationIntegrationTestsMixin: - # To be populated by the child classes - framework_dependent_parameters = { - "AutoModelForCausalLM": None, - "AutoModelForSpeechSeq2Seq": None, - "AutoModelForSeq2SeqLM": None, - "AutoModelForVision2Seq": None, - "LogitsProcessorList": None, - "MinLengthLogitsProcessor": None, - "create_tensor_fn": None, - "floats_tensor": None, - "return_tensors": None, - "set_seed": None, - } - - def test_validate_generation_inputs(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-t5") - - encoder_input_str = "Hello world" - input_ids = tokenizer(encoder_input_str, return_tensors=return_tensors).input_ids - - # typos are quickly detected (the correct argument is `do_sample`) - with self.assertRaisesRegex(ValueError, "do_samples"): - model.generate(input_ids, do_samples=True) - - # arbitrary arguments that will not be used anywhere are also not accepted - with self.assertRaisesRegex(ValueError, "foo"): - fake_model_kwargs = {"foo": "bar"} - model.generate(input_ids, **fake_model_kwargs) - - # however, valid model_kwargs are accepted - valid_model_kwargs = {"attention_mask": create_tensor_fn(np.zeros_like(input_ids))} - model.generate(input_ids, **valid_model_kwargs) - - def test_custom_logits_processor(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - logits_processor_list_cls = self.framework_dependent_parameters["LogitsProcessorList"] - min_length_logits_processor_cls = self.framework_dependent_parameters["MinLengthLogitsProcessor"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", min_length=1) - input_ids = bart_tokenizer(article, return_tensors=return_tensors).input_ids - - logits_processor = logits_processor_list_cls() - logits_processor.append(min_length_logits_processor_cls(min_length=10, eos_token_id=0)) - # it should not be allowed to both define `min_length` via config and `logits_processor` list - with self.assertRaises(ValueError): - bart_model.generate(input_ids, logits_processor=logits_processor) - - bart_model.config.min_length = None - bart_model.generate(input_ids, logits_processor=logits_processor) - - def test_max_new_tokens_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - bart_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart") - input_ids = bart_tokenizer(article, return_tensors=return_tensors).input_ids - - self.assertEqual(list(input_ids.shape), [1, 29]) - - max_new_tokens = 3 - bart_model.config.max_length = 20 - bart_model.config.eos_token_id = None - - # Encoder decoder 
call - outputs = bart_model.generate(input_ids, max_new_tokens=max_new_tokens) - # 1 BOS + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 4]) - - # Decoder only call - outputs = bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=max_new_tokens) - # 1 BOS + 29 (input length) + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 33]) - - # Encoder decoder call > 20 - outputs = bart_model.generate(max_new_tokens=max_new_tokens + 20) - - # 1 BOS + 20 + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 24]) - - def test_max_new_tokens_decoder_only(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """Justin Timberlake.""" - gpt2_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_ids = gpt2_tokenizer(article, return_tensors=return_tensors).input_ids - - - self.assertEqual(list(input_ids.shape), [1, 9]) - - max_new_tokens = 3 - gpt2_model.config.max_length = 20 - - # call < 20 - outputs = gpt2_model.generate(input_ids, max_new_tokens=max_new_tokens) - - # 9 input_ids + 3 new tokens - self.assertEqual(list(outputs.shape), [1, 12]) - - # call > 20 - outputs = gpt2_model.generate(max_new_tokens=max_new_tokens + 20) - - # 1 BOS token + 23 new tokens - self.assertEqual(list(outputs.shape), [1, 24]) - - def test_encoder_decoder_generate_with_inputs_embeds(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=5) - model.config.eos_token_id = None - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - - inputs_embeds = model.get_input_embeddings()(input_ids) - - output_sequences = model.generate(inputs_embeds=inputs_embeds) - - # make sure model generated correctly until `max_length` - self.assertEqual(output_sequences.shape, (1, 5)) - - def test_transition_scores_greedy_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - articles = ["Justin Timberlake", "Michael Phelps"] - tokenizer = AutoTokenizer.from_pretrained("distilgpt2", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained("distilgpt2") - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - - outputs = model.generate( - input_ids=input_ids, - max_new_tokens=5, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - ) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - - expected_scores = np.array( - [ - [-57.8844, -60.45698, -70.16364, -65.50791, -66.35648], - [-54.417572, -60.216614, -62.661243, -58.621933, -58.298683], - ] - ) - self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) - - def test_transition_scores_greedy_search_normalized(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - 
return_tensors = self.framework_dependent_parameters["return_tensors"] - - articles = ["Justin Timberlake", "Michael Phelps"] - tokenizer = AutoTokenizer.from_pretrained("distilgpt2", padding_side="left") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained("distilgpt2") - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - - outputs = model.generate( - input_ids=input_ids, - max_new_tokens=5, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - ) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - - expected_scores = np.array( - [ - [-2.538938, -2.2694316, -2.1580915, -1.572299, -2.6719835], - [-1.8826028, -2.2461371, -1.7556462, -2.9644494, -1.7996008], - ] - ) - self.assertTrue(np.allclose(transition_scores, expected_scores, atol=1e-3)) - - def test_transition_scores_beam_search_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=4, - num_return_sequences=2, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_search_encoder_decoder_with_eos(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=4, - num_return_sequences=2, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_search_decoder_only(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = 
self.framework_dependent_parameters["return_tensors"] - - articles = [ - "Justin Timberlake", - "Michael Phelps", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer.pad_token = tokenizer.eos_token - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-gpt2", - max_length=10, - num_beams=4, - num_return_sequences=2, - pad_token_id=tokenizer.eos_token_id, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - def test_transition_scores_beam_sample_encoder_decoder(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", - do_sample=True, - max_length=10, - num_beams=4, - num_return_sequences=2, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - input_ids = tokenizer(articles, return_tensors=return_tensors, padding=True).input_ids - - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores, atol=1e-3)) - - @slow - def test_transition_scores_early_stopping(self): - # This is an aggressive test that makes sure that `beam_search's` - # transition scores are computed correctly for varying `num_return_sequences`, `num_beams` and `batch_size > 1` - # 2 x input_ids for "question: How are you? 
\n context: I had a long day, " - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - - input_ids = create_tensor_fn(2 * [[822, 10, 571, 33, 25, 58, 2625, 10, 27, 141, 3, 9, 307, 239, 6, 1]]) - model = model_cls.from_pretrained("t5-small") - - outputs = model.generate( - input_ids, - max_length=10, - return_dict_in_generate=True, - output_scores=True, - forced_eos_token_id=model.config.eos_token_id, - num_beams=4, - do_sample=False, - num_return_sequences=3, - length_penalty=0.0, - ) - - transition_scores = model.compute_transition_scores( - sequences=outputs.sequences, scores=outputs.scores, beam_indices=outputs.beam_indices - ) - if is_pt: - transition_scores = transition_scores.cpu().numpy() - outputs.sequences_scores = outputs.sequences_scores.cpu().numpy() - - self.assertTrue(np.allclose(np.sum(transition_scores, axis=-1), outputs.sequences_scores)) - - def test_encoder_decoder_generate_attention_mask(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - articles = ["Timberlake", "Jessica Biel, welcome to parenthood among other things"] - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - # need extreme generation values here to force this test - # to fail when `attention_mask` is not correctly treated in generate - model = model_cls.from_pretrained( - "hf-internal-testing/tiny-random-bart", max_length=50, num_beams=5, num_return_sequences=5 - ) - model.config.eos_token_id = None - input_ids = tokenizer(articles[0], return_tensors=return_tensors).input_ids - input_ids_batched = tokenizer(articles, padding=True, return_tensors=return_tensors).input_ids - - output_sequences_batched = model.generate( - input_ids=input_ids_batched, return_dict_in_generate=True, output_scores=True - ) - output_sequences = model.generate(input_ids=input_ids, return_dict_in_generate=True, output_scores=True) - - batched_out = output_sequences_batched.sequences_scores - out = output_sequences.sequences_scores - if is_pt: - batched_out = batched_out.cpu().numpy() - out = out.cpu().numpy() - - diff = np.abs(np.sum(batched_out[:5]) - np.sum(out)) - self.assertTrue(diff < 1e-4) - - def test_generate_input_ids_as_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=15) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (1, 15)) - - def test_generate_input_ids_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - tokenizer = 
AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=5) - model.config.eos_token_id = None - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (1, 5)) - - def test_generate_inputs_and_encoder_kwargs(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", max_length=10) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - with self.assertRaises(ValueError): - model.generate(input_ids, input_ids=input_ids) - - def test_generate_too_many_encoder_kwargs(self): - model_cls = self.framework_dependent_parameters["AutoModelForSeq2SeqLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - article = """I need input_ids to generate""" - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-bart", max_length=10) - input_ids = tokenizer(article, return_tensors=return_tensors).input_ids - with self.assertRaises(ValueError): - model.generate(input_ids=input_ids, inputs_embeds=input_ids) - - def test_generate_input_features_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForSpeechSeq2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - - input_features = floats_tensor((3, 80, 60)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-WhisperForConditionalGeneration") - - output_sequences_kwargs = model.generate(input_features=input_features, max_length=5) - output_sequences = model.generate(input_features, max_length=5) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (3, 5)) - - def test_generate_pixel_values_as_encoder_kwarg(self): - model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - - pixel_values = floats_tensor((2, 3, 30, 30)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") - model.config.decoder.eos_token_id = None - - output_sequences_kwargs = model.generate(pixel_values=pixel_values, max_length=5) - output_sequences = model.generate(pixel_values, max_length=5) - if is_pt: - output_sequences_kwargs = output_sequences_kwargs.cpu().numpy() - output_sequences = output_sequences.cpu().numpy() - - self.assertTrue(np.array_equal(output_sequences, output_sequences_kwargs)) - self.assertEqual(output_sequences.shape, (2, 5)) - - def test_generate_encoder_outputs_attention_mask(self): - model_cls = 
self.framework_dependent_parameters["AutoModelForSpeechSeq2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - - input_features = floats_tensor((3, 80, 60)) - attention_mask = create_tensor_fn(np.ones(input_features.shape)) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-WhisperForConditionalGeneration") - - encoder = model.get_encoder() - encoder_outputs = encoder(input_features) - - output_sequences_no_mask = model.generate(encoder_outputs=encoder_outputs) - output_sequences_with_mask = model.generate(encoder_outputs=encoder_outputs, attention_mask=attention_mask) - - self.assertTrue(np.array_equal(output_sequences_no_mask, output_sequences_with_mask)) - - def test_eos_token_id_int_and_list_greedy_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - generation_kwargs = { - "do_sample": False, - "num_beams": 1, - } - expectation = 13 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - eos_token_id = 873 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [873, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_eos_token_id_int_and_list_contrastive_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - generation_kwargs = { - "do_sample": False, - "num_beams": 1, - "penalty_alpha": 0.6, - "top_k": 4, - } - expectation = 17 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - eos_token_id = 225 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - eos_token_id = [225, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_eos_token_id_int_and_list_beam_search(self): - model_cls = self.framework_dependent_parameters["AutoModelForCausalLM"] - return_tensors = self.framework_dependent_parameters["return_tensors"] - - generation_kwargs = { - "do_sample": False, - "num_beams": 3, - } - expectation = 13 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - eos_token_id = 873 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - unpadded_correct_condition = expectation == len(generated_tokens[0]) - padded_correct_condition = expectation < len(generated_tokens[0]) and all( - token == model.config.pad_token_id for token in 
generated_tokens[0][expectation:] - ) - self.assertTrue(unpadded_correct_condition or padded_correct_condition) - - eos_token_id = [873, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - unpadded_correct_condition = expectation == len(generated_tokens[0]) - padded_correct_condition = expectation < len(generated_tokens[0]) and all( - token == model.config.pad_token_id for token in generated_tokens[0][expectation:] - ) - self.assertTrue(unpadded_correct_condition or padded_correct_condition) - - def test_generate_vision2text_conditioning(self): - model_cls = self.framework_dependent_parameters["AutoModelForVision2Seq"] - floats_tensor = self.framework_dependent_parameters["floats_tensor"] - create_tensor_fn = self.framework_dependent_parameters["create_tensor_fn"] - - pixel_values = floats_tensor((2, 3, 30, 30)) - conditioning_input = create_tensor_fn([[10], [10]]) # this should be the 2nd output token, after the BOS token - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2") - - # we can condition on decoder_input_ids (expected decoder input) and input_ids (which we pipe internally as - # decoder_input_ids, if the encoder is not a model with text input) - output_sequences_decoder_input_ids = model.generate( - pixel_values, max_length=5, decoder_input_ids=conditioning_input - ) - output_sequences_input_ids = model.generate(pixel_values, max_length=5, input_ids=conditioning_input) - - self.assertTrue(np.array_equal(output_sequences_decoder_input_ids, output_sequences_input_ids)) - self.assertTrue(np.array_equal(output_sequences_decoder_input_ids[:, 1:2], conditioning_input)) diff --git a/tests/transformers/generation/test_utils.py b/tests/transformers/generation/test_utils.py deleted file mode 100644 index 35b486d6a..000000000 --- a/tests/transformers/generation/test_utils.py +++ /dev/null @@ -1,3289 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- - -import copy -import inspect -import pytest -import unittest -import warnings - -import numpy as np -from parameterized import parameterized - -from mindnlp.engine import set_seed -from mindnlp.transformers import pipeline -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - is_flaky, - require_mindspore, - slow, -) - -from ..test_modeling_common import floats_tensor, ids_tensor -from .test_framework_agnostic import GenerationIntegrationTestsMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, - AutoProcessor, - AutoTokenizer, - BartForCausalLM, - BartForConditionalGeneration, - BartTokenizer, - GPT2LMHeadModel, - GPT2Tokenizer, - ImageGPTForCausalImageModeling, - SpeechEncoderDecoderModel, - ) - from mindnlp.transformers.cache_utils import DynamicCache, EncoderDecoderCache - from mindnlp.transformers.generation import ( - BeamSampleDecoderOnlyOutput, - BeamSampleEncoderDecoderOutput, - BeamSearchDecoderOnlyOutput, - BeamSearchEncoderDecoderOutput, - DisjunctiveConstraint, - GenerateBeamDecoderOnlyOutput, - GenerateBeamEncoderDecoderOutput, - GenerateDecoderOnlyOutput, - GenerateEncoderDecoderOutput, - GenerationConfig, - GreedySearchDecoderOnlyOutput, - GreedySearchEncoderDecoderOutput, - LogitsProcessorList, - MaxLengthCriteria, - MinLengthLogitsProcessor, - PhrasalConstraint, - PromptLookupCandidateGenerator, - SampleDecoderOnlyOutput, - SampleEncoderDecoderOutput, - StoppingCriteria, - StoppingCriteriaList, - WatermarkDetector, - WatermarkingConfig, - ) - from mindnlp.transformers.generation.utils import _speculative_sampling - - -class GenerationTesterMixin: - model_tester = None - all_generative_model_classes = () - input_name = "input_ids" - max_new_tokens = 3 - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # TODO: @raushan or @gante, use `model.main_input_name` as the main input instead of relying on `input_ids` - input_ids = inputs_dict.pop(self.input_name)[:batch_size, :] - inputs_dict.pop("attention_mask", None) - - # we don't want encoder-decoder models to start from filled decoder ids - inputs_dict.pop("decoder_input_ids", None) - inputs_dict.pop("decoder_attention_mask", None) - - # we'll set cache use in each test differently - inputs_dict.pop("use_cache", None) - - inputs_dict = { - k: v[:batch_size, ...]
- for k, v in inputs_dict.items() - if "head_mask" not in k and isinstance(v, mindspore.Tensor) - } - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - - if self.has_attentions: - attention_mask = ops.ones_like(input_ids, dtype=mindspore.int64) - else: - attention_mask = None - - # It is important to set the eos_token_id to None to ensure that no sequences - # shorter than `max_length` can be generated - config.eos_token_id = None - config.forced_eos_token_id = None - - return config, input_ids, attention_mask, inputs_dict - - def _get_logits_processor_kwargs(self, do_sample=False, config=None): - logits_processor_kwargs = { - "bad_words_ids": [[1, 0]], - "repetition_penalty": 1.2, - "remove_invalid_values": True, - } - if do_sample: - logits_processor_kwargs.update( - { - "top_k": 10, - "top_p": 0.7, - "temperature": 0.7, - } - ) - # TODO (joao, raushan): see this comment for a long-term fix - # https://github.com/huggingface/transformers/pull/33593#issuecomment-2361824264) - # This is a band-aid for VLM models, to ensure they don't generate image/video tokens which would cause them - # to crash. On pretrained models this isn't a risk, as they are trained to not generate these tokens. - if config is not None: - image_token_index = config.image_token_index if hasattr(config, "image_token_index") else None - video_token_index = config.video_token_index if hasattr(config, "video_token_index") else None - if image_token_index is not None and image_token_index < config.get_text_config().vocab_size: - logits_processor_kwargs["bad_words_ids"].append([image_token_index]) - if video_token_index is not None and video_token_index < config.get_text_config().vocab_size: - logits_processor_kwargs["bad_words_ids"].append([video_token_index]) - - return logits_processor_kwargs - - @staticmethod - def _get_logits_processor_and_warper_kwargs( - input_length, - forced_bos_token_id=None, - forced_eos_token_id=None, - ): - process_kwargs = { - "bad_words_ids": [[1, 0]], - "repetition_penalty": 1.2, - "remove_invalid_values": True, - } - # NoRepeatNGramLogitsProcessor + forced tokens may result in no valid continuations - if forced_bos_token_id is None and forced_eos_token_id is None: - process_kwargs["no_repeat_ngram_size"] = 2 - - warp_kwargs = {"top_k": 10, "top_p": 0.7, "temperature": 0.7} - return process_kwargs, warp_kwargs - - @staticmethod - def _get_beam_kwargs(num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - } - return beam_kwargs - - @staticmethod - def _get_diverse_beam_kwargs(num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - "num_beam_groups": 2, # one beam per group - "diversity_penalty": 2.0, - } - return beam_kwargs - - @staticmethod - def _get_constrained_beam_kwargs(num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": num_return_sequences * 4, - "num_return_sequences": num_return_sequences, - } - return beam_kwargs - - @staticmethod - def _get_encoder_outputs( - model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 - ): - encoder = 
model.get_encoder() - encoder_outputs = encoder( - input_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( - num_interleave, dim=0 - ) - generation_config = copy.deepcopy(model.generation_config) - model._prepare_special_tokens(generation_config) - input_ids = ops.zeros_like(input_ids[:, :1]) + generation_config.decoder_start_token_id - attention_mask = None - return encoder_outputs, input_ids, attention_mask - - def _greedy_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - output_logits=output_logits, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **logits_processor_kwargs, - **model_kwargs, - **inputs_dict, - ) - - return output_generate - - def _sample_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - num_return_sequences, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - set_seed(0) - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=True, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=True, - num_beams=1, - max_new_tokens=self.max_new_tokens, - num_return_sequences=num_return_sequences, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **logits_processor_kwargs, - **model_kwargs, - **inputs_dict, - ) - - return output_generate - - def _beam_search_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - max_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **model_kwargs, - **inputs_dict, - ) - - return output_generate - - def _beam_sample_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - 
use_cache=True, - ): - set_seed(123) - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=True, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=True, - max_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **model_kwargs, - **inputs_dict, - ) - - return output_generate - - def _group_beam_search_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - max_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **model_kwargs, - **inputs_dict, - ) - - return output_generate - - def _constrained_beam_search_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - constraints, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - max_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - constraints=constraints, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **model_kwargs, - **inputs_dict, - ) - - return output_generate - - def _contrastive_generate( - self, - model, - input_ids, - attention_mask, - inputs_dict, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - contrastive_search_kwargs = { - "penalty_alpha": 0.6, - "top_k": 5, - } - - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - output_logits=output_logits, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **logits_processor_kwargs, - **model_kwargs, - **contrastive_search_kwargs, - **inputs_dict, - ) - - return output_generate - - @pytest.mark.generate - def 
test_greedy_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, input_ids=input_ids, attention_mask=attention_mask, inputs_dict=inputs_dict - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - @pytest.mark.generate - def test_greedy_generate_dict_outputs(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, GreedySearchEncoderDecoderOutput) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - - self._check_outputs(output_generate, input_ids, model.config) - - @pytest.mark.generate - def test_greedy_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): - self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") - - config.is_decoder = True - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=True, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - self._check_outputs(output_generate, input_ids, model.config, use_cache=True) - - - @pytest.mark.generate - def test_sample_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - num_return_sequences=1, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 
input_ids.shape[-1]) - - @pytest.mark.generate - def test_sample_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - num_return_sequences=2, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, SampleEncoderDecoderOutput) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - - self._check_outputs(output_generate, input_ids, model.config, num_return_sequences=2) - - @pytest.mark.generate - def test_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - - beam_kwargs = self._get_beam_kwargs() - output_generate = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - @pytest.mark.generate - def test_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - beam_kwargs = self._get_beam_kwargs() - output_generate = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) - - @pytest.mark.generate - def test_beam_search_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - if not hasattr(config, "use_cache"): - 
self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): - self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") - - model = model_class(config).eval() - beam_kwargs = self._get_beam_kwargs() - - config.is_decoder = True - model = model_class(config).eval() - output_generate = self._beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=True, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - self._check_outputs( - output_generate, input_ids, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"] - ) - - @pytest.mark.generate - def test_beam_sample_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - model = model_class(config).eval() - beam_kwargs = self._get_beam_kwargs() - output_generate = self._beam_sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - # for VLMs inputs embeds won't match input ids unless images are encoded and merged with ids properly - # no quick fix available, since obtaining image embeddings step is very model-specific - if any(name in model.__class__.__name__.lower() for name in ("blip", "llava", "paligemma")): - prepare_inputs_for_generation_args = set( - inspect.signature(model.prepare_inputs_for_generation).parameters - ) - # `inputs_embeds` input is well supported when `cache_positions` is used, because it means the modeling - # code is up to date with our most recent standards - if ( - "inputs_embeds" in prepare_inputs_for_generation_args - and "cache_positions" in prepare_inputs_for_generation_args - ): - input_embeds = model.get_input_embeddings()(input_ids) - beam_kwargs.update({"inputs_embeds": input_embeds}) - output_generate2 = self._beam_sample_generate( - model=model, - input_ids=None, - attention_mask=attention_mask, - inputs_dict={}, - beam_kwargs=beam_kwargs, - ) - assert ops.allclose(output_generate[:, input_embeds.shape[1] :], output_generate2) - - @pytest.mark.generate - def test_beam_sample_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - beam_kwargs = self._get_beam_kwargs() - - output_generate = self._beam_sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == 
self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSampleEncoderDecoderOutput) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) - - self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) - - def test_generate_without_input_ids(self): - config, _, _, _ = self._get_input_ids_and_config() - - # if no bos token id => cannot generate from None - if config.bos_token_id is None: - self.skipTest(reason="bos_token_id is None") - - # hack in case they are equal, otherwise the attn mask will be [0] - if config.bos_token_id == config.pad_token_id: - config.pad_token_id = None - - for model_class in self.all_generative_model_classes: - model = model_class(config) - model.eval() - - output_ids_generate = model.generate( - do_sample=False, max_new_tokens=self.max_new_tokens, remove_invalid_values=True - ) - self.assertIsNotNone(output_ids_generate) - - @pytest.mark.generate - def test_group_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - # check `generate()` and `group_beam_search()` are equal - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - # check `group_beam_search` for higher than 1 `num_return_sequences` - num_return_sequences = 2 - beam_kwargs = self._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - output_generate = self._group_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - 
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) - - @is_flaky() - @pytest.mark.generate - def test_constrained_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - - # Sample constraints - min_id = 3 - max_id = config.get_text_config(decoder=True).vocab_size - - force_tokens = ops.randint(min_id, max_id, (1, 2)).tolist()[0] - constraints = [ - PhrasalConstraint(force_tokens), - ] - - beam_kwargs = self._get_constrained_beam_kwargs() - output_generate = self._constrained_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - constraints=constraints, - beam_kwargs=beam_kwargs, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - for generation_output in output_generate: - self._check_sequence_inside_sequence(force_tokens, generation_output) - - # check`constrained_beam_search` for higher than 1 `num_return_sequences` - # Sample constraints - force_tokens = ops.randint(min_id, max_id, (1, 2)).tolist()[0] - constraints = [ - PhrasalConstraint(force_tokens), - ] - - beam_kwargs = self._get_constrained_beam_kwargs(num_return_sequences=2) - - output_generate = self._constrained_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - constraints=constraints, - beam_kwargs=beam_kwargs, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - for generation_output in output_generate: - self._check_sequence_inside_sequence(force_tokens, generation_output) - - @pytest.mark.generate - def test_constrained_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - - # Sample constraints - min_id = 3 - max_id = model.config.get_text_config(decoder=True).vocab_size - force_tokens = ops.randint(min_id, max_id, (1, 2)).tolist()[0] - constraints = [ - PhrasalConstraint(force_tokens), - ] - - beam_kwargs = self._get_constrained_beam_kwargs() - output_generate = self._constrained_beam_search_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - constraints=constraints, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - 
else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self._check_outputs( - output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) - - @pytest.mark.generate - def test_contrastive_generate(self): - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support contrastive search generation") - - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - # NOTE: contrastive search only works with cache on at the moment. - if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - config.is_decoder = True - - # test old generation output for backwards compatibility - model = model_class(config).eval() - output_generate = self._contrastive_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - use_cache=True, - ) - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - @pytest.mark.generate - def test_contrastive_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support contrastive search generation") - - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - # NOTE: contrastive search only works with cache on at the moment. 
- if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - config.is_decoder = True - - model = model_class(config).eval() - output_generate = self._contrastive_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_dict=inputs_dict, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=True, - ) - - if model.config.is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1]) - - self._check_outputs(output_generate, input_ids, model.config, use_cache=True) - - @pytest.mark.generate - def test_contrastive_generate_low_memory(self): - # Check that choosing 'low_memory' does not change the model output - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support contrastive search generation") - - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer", "speech2text"]): - self.skipTest(reason="Won't fix: old model with different cache format") - if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]): - self.skipTest(reason="TODO: fix me") - - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) - - # NOTE: contrastive search only works with cache on at the moment. - if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - config.is_decoder = True - - # test output equality of low versus high memory - model = model_class(config).eval() - - low_output = model.generate( - input_ids, - top_k=4, - penalty_alpha=0.6, - low_memory=True, - max_new_tokens=self.max_new_tokens, - attention_mask=attention_mask, - **inputs_dict, - use_cache=True, - ) - - high_output = model.generate( - input_ids, - top_k=4, - penalty_alpha=0.6, - low_memory=False, - max_new_tokens=self.max_new_tokens, - attention_mask=attention_mask, - **inputs_dict, - use_cache=True, - ) - self.assertListEqual(low_output.tolist(), high_output.tolist()) - - def test_beam_search_low_memory(self): - # Check that choosing 'low_memory' does not change the model output - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="May fix in the future: need custom cache handling") - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - if any( - model_name in model_class.__name__.lower() - for model_name in [ - "ctrl", - "gptbigcode", - "transo_xl", - "xlnet", - "cpm", - "jamba", - ] - ): - self.skipTest(reason="May fix in the future: need model-specific fixes") - config, input_ids, _, _ = self._get_input_ids_and_config(batch_size=2) - # batch_size=1 is ok, but batch_size>1 will cause non-identical output - - config.use_cache = True - config.is_decoder = True - - # test output equality of low versus high memory - model = model_class(config).eval() - - low_output = model.generate(input_ids, max_new_tokens=8, num_beams=5, early_stopping=True, low_memory=True) - - high_output = model.generate( - input_ids, max_new_tokens=8, num_beams=5, early_stopping=True, low_memory=False - ) - 
self.assertListEqual(low_output.tolist(), high_output.tolist()) - - @pytest.mark.generate - @parameterized.expand([("random",), ("same",)]) - @is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail. - def test_assisted_decoding_matches_greedy_search(self, assistant_type): - # This test ensures that the assisted generation does not introduce output changes over greedy search. - # NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul - # shape differences -- and it may result in a different output. The input shape difference happens in the - # main model, that runs the forward pass with several candidates at once (as opposed to generating one token at - # a time). See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. - # NOTE (2): It breaks the pattern in the tests above, for multiple reasons: - # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. needs to - # prepare the assistant encoder outputs in the main generate body); - # - assisted_decoding does not support `use_cache = False` - # - assisted_decoding does not support `batch_size > 1` - - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support assisted generation") - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - if any( - model_name in model_class.__name__.lower() - for model_name in [ - "bigbirdpegasus", - "led", - "mega", - "speech2text", - "git", - "prophetnet", - "seamlessm4t", - "clvp", - ] - ): - self.skipTest(reason="May fix in the future: need model-specific fixes") - - # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) - - # NOTE: assisted generation only works with cache on at the moment. 
- if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - config.is_decoder = True - model = model_class(config).eval() - # Sets assisted generation arguments such that: - # a) no EOS is generated, to ensure generation doesn't break early - # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of - # the assistant model is correct - # c) there are at least two forward passes in the main model, to ensure the input preparation of - # the main model is correct - generation_kwargs = { - "eos_token_id": -1, # see a) - "max_new_tokens": 4, # see c) - "num_beams": 1, - "do_sample": False, - "output_scores": True, - "output_logits": True, - "output_hidden_states": True, - "output_attentions": self.has_attentions, - "return_dict_in_generate": True, - "use_cache": True, - } - output_greedy = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) - - # test with the same assistant model or randomly init one - # in the first case all candidate tokens are accepted, in the second none is accepted - # case when some are accepted and some not is hard to reproduce, so let's hope this catches most errors :) - if assistant_type == "random": - assistant_model = model_class(config).eval() - else: - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 2 # see b) - assistant_model.generation_config.num_assistant_tokens_schedule = "constant" # see b) - generation_kwargs.update({"assistant_model": assistant_model}) - output_assisted = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) - - # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) - for output in (output_greedy, output_assisted): - self._check_outputs(output, input_ids, model.config, use_cache=True) - - @pytest.mark.skip - def test_prompt_lookup_decoding_matches_greedy_search(self): - # This test ensures that the prompt lookup generation does not introduce output changes over greedy search. - # This test is mostly a copy of test_assisted_decoding_matches_greedy_search - - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support assisted generation") - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - if any( - model_name in model_class.__name__.lower() - for model_name in [ - "bigbirdpegasus", - "led", - "mega", - "speech2text", - "git", - "prophetnet", - "seamlessm4t", - "clvp", - ] - ): - self.skipTest(reason="May fix in the future: need model-specific fixes") - - # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) - - # NOTE: assisted generation only works with cache on at the moment. 
- if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - config.is_decoder = True - model = model_class(config).eval() - # Sets assisted generation arguments such that: - # a) no EOS is generated, to ensure generation doesn't break early - # b) the prompt lookup tries to give the model 2 tokens, to ensure the input preparation of - # prompt lookup is correct - # c) there are at least two forward passes in the main model, to ensure the input preparation of - # the main model is correct - generation_kwargs = { - "eos_token_id": -1, # see a) - "max_new_tokens": 4, # see c) - "num_beams": 1, - "do_sample": False, - "output_scores": True, - "output_logits": True, - "output_hidden_states": True, - "output_attentions": self.has_attentions, - "return_dict_in_generate": True, - "use_cache": True, - } - - output_greedy = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) - - generation_kwargs.update({"prompt_lookup_num_tokens": 2}) # see b) - output_prompt_lookup = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) - - # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist()) - for output in (output_greedy, output_prompt_lookup): - self._check_outputs(output, input_ids, model.config, use_cache=True) - - @pytest.mark.generate - def test_dola_decoding_sample(self): - # TODO (joao): investigate skips, try to reduce incompatibilities - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support DoLa decoding") - - if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]): - self.skipTest("Skip Reformer as the lm_head input size is 2 * hidden size, adopted from Rev Nets.") - - if any(model_name in model_class.__name__.lower() for model_name in ["marian", "mbart", "pegasus"]): - self.skipTest("DoLa is not supported for models that don't return layerwise hidden states") - - # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - # Encoder-decoder models are not supported - if config.is_encoder_decoder: - self.skipTest("DoLa is not supported for encoder-decoder models") - config.is_decoder = True - model = model_class(config).eval() - - if model.get_output_embeddings() is None: - self.skipTest("DoLa is not supported for models that don't have output embeddings") - # Sets dola generation arguments such that: - # a) no EOS is generated, to ensure generation doesn't break early - # b) there are at least two forward passes in the main model, to ensure the input preparation of - # the main model is correct - generation_kwargs = { - "eos_token_id": -1, # see a) - "max_new_tokens": 4, # see b) - "num_beams": 1, - "do_sample": True, - "output_scores": True, - "output_logits": True, - "output_hidden_states": True, - "output_attentions": self.has_attentions, - "return_dict_in_generate": True, - "use_cache": hasattr(config, "use_cache"), # Some models don't support the cache - } - generation_kwargs.update({"dola_layers": "low"}) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs, **inputs_dict) - self._check_outputs(output_dola, input_ids, 
model.config, use_cache=hasattr(config, "use_cache")) - - @pytest.mark.generate - def test_assisted_decoding_sample(self): - # In this test we don't check assisted vs non-assisted output -- seeded assisted decoding with sample will not - # match sample for the same seed, as the forward pass does not return the exact same logits (due to matmul with - # different shapes, see https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535). - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support assisted generation") - if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - if any( - model_name in model_class.__name__.lower() - for model_name in [ - "bigbirdpegasus", - "led", - "mega", - "speech2text", - "git", - "prophetnet", - "seamlessm4t", - "clvp", - ] - ): - self.skipTest(reason="May fix in the future: need model-specific fixes") - - # enable cache - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1) - - # NOTE: assisted generation only works with cache on at the moment. - if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - config.is_decoder = True - model = model_class(config).eval() - # Sets assisted generation arguments such that: - # a) no EOS is generated, to ensure generation doesn't break early - # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of - # the assistant model is correct - # c) there are at least two forward passes in the main model, to ensure the input preparation of - # the main model is correct - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 2 # see b) - assistant_model.generation_config.num_assistant_tokens_schedule = "constant" # see b) - generation_kwargs = { - "eos_token_id": -1, # see a) - "max_new_tokens": 4, # see c) - "num_beams": 1, - "do_sample": True, - "assistant_model": assistant_model, - "output_scores": True, - "output_logits": True, - "output_hidden_states": True, - "output_attentions": self.has_attentions, - "return_dict_in_generate": True, - "use_cache": True, - } - output_assisted = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) - - self._check_outputs(output_assisted, input_ids, config, use_cache=True) - - @pytest.mark.generate - def test_prompt_lookup_decoding_stops_at_eos(self): - # This test ensures that the prompt lookup generation stops at eos token and does not suggest more tokens - # (see https://github.com/huggingface/transformers/pull/31301) - - # The main idea is to have an ngram (unigram in our case) that is repeated twice in the input ids. - # First time at the very end, so input ends with the unigrams, and second any arbitrary location. - # Also, we need an EOS token which will be injected just after the arbitrary located ngram. - # We verify that PLD will not copy and propose candidates that contain an EOS token, even if there are overlapping ngrams - # in input ids. Otherwise a proposed EOS along with the trailing (ngrams-1) tokens might be accepted by the target model. 
- # That seems as if the model "generated" an EOS but didn't stop from user's perspective - - input_ids = ops.randint(1, 50, (1, 10)) # generate inputs in range from 1-50 - arbitrary_ngram = 51 # this is the arbitrary ngram, specifically chosen OOV to prevent flaky tests - input_ids[:, 3] = arbitrary_ngram # set pre-eos to arbitrary_ngram which is for sure not present in inputs - input_ids[:, -1] = arbitrary_ngram # put arbitrary_ngram in the end for the necessary match to happen - - eos_token_id = mindspore.tensor([0]) - input_ids[:, 4] = eos_token_id # inject eos-token-id in input ids so that it is located after arbitrary_ngram - - # init candidate generator with max_matching_ngram_size=1 to match per-token - candidate_generator = PromptLookupCandidateGenerator( - eos_token_id=eos_token_id, num_output_tokens=4, max_matching_ngram_size=1 - ) - output_prompt_lookup = candidate_generator.get_candidates(input_ids)[0] - - # PLD shouldn't propose any new tokens based on eos-match - self.assertTrue(output_prompt_lookup.shape[-1] == 10) - - @pytest.mark.generate - def test_generate_with_head_masking(self): - """Test designed for encoder-decoder models to ensure the attention head masking is used.""" - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - # We want to test only encoder-decoder models - if not config.is_encoder_decoder: - continue - model = model_class(config) - - head_masking = { - "head_mask": ops.zeros(config.encoder_layers, config.encoder_attention_heads), - "decoder_head_mask": ops.zeros( - config.decoder_layers, config.decoder_attention_heads - ), - "cross_attn_head_mask": ops.zeros( - config.decoder_layers, config.decoder_attention_heads - ), - } - - signature = inspect.signature(model.forward) - # We want to test only models where encoder/decoder head masking is implemented - if not set(head_masking.keys()) < {*signature.parameters.keys()}: - continue - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - out = model.generate( - input_ids, - attention_mask=attention_mask, - num_beams=1, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - remove_invalid_values=True, - **{name: mask}, - **inputs_dict, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) - - @pytest.mark.generate - def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. 
- # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _, _, _ = self._get_input_ids_and_config() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. - has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict().keys() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." - ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = ops.cumsum(attention_mask, dim=-1) - 1 - position_ids = position_ids.masked_fill(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = ops.arange(input_ids.shape[-1]) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, input_ids, attention_mask, _ = self._get_input_ids_and_config() - model = model_class(config).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = ops.ones(pad_size, dtype=input_ids.dtype) * pad_token_id - padded_input_ids = ops.cat((padding, input_ids), dim=1) - padded_attention_mask = ops.cat((ops.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - self.assertTrue(ops.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) - - def test_past_key_values_format(self): - # Test that the KV cache is formatted correctly. Exceptions need to explicitly overwrite this test. Having a - # standard KV cache format is important for a consistent API (and for advanced generation methods). 
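For reference, a minimal standalone sketch of the legacy cache layout these assertions expect: one entry per decoder layer, each holding the self-attention key and value of shape (batch, num_heads, seq_len, head_dim), with encoder-decoder models carrying two extra cross-attention tensors per layer. The checkpoint and import paths below are assumptions mirroring usage elsewhere in this file, not part of the original test.

# Sketch: inspect the cache returned by a tiny decoder-only model (assumed checkpoint/imports).
from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
input_ids = tokenizer("Hello world", return_tensors="ms").input_ids

outputs = model(input_ids, use_cache=True)
past_kv = outputs.past_key_values              # one (key, value) pair per decoder layer
key, value = past_kv[0]                        # each: (batch, num_heads, seq_len, head_dim)
print(len(past_kv), key.shape, value.shape)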
- for model_class in self.all_generative_model_classes: - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - # If it doesn't support cache, pass the test - if not hasattr(config, "use_cache"): - self.skipTest(reason="This model doesn't support caching") - - model = model_class(config) - if "use_cache" not in inputs: - inputs["use_cache"] = True - outputs = model(**inputs) - - # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) - if "past_key_values" not in outputs: - self.skipTest(reason="This model doesn't return `past_key_values`") - - num_hidden_layers = ( - getattr(config, "decoder_layers", None) - or getattr(config, "num_decoder_layers", None) - or config.num_hidden_layers - ) - num_attention_heads = getattr(config, "decoder_attention_heads", config.num_attention_heads) - embed_dim = getattr(config, "d_model", config.hidden_size) - per_head_embed_dim = embed_dim // num_attention_heads - - past_kv = outputs["past_key_values"] - self.assertEqual(len(past_kv), num_hidden_layers) - - # Encoder-Decoder checks - if config.is_encoder_decoder: - encoder_num_attention_heads = config.encoder_attention_heads - encoder_per_head_embed_dim = embed_dim // encoder_num_attention_heads - batch_size, seq_length = inputs["decoder_input_ids"].shape - for i in range(num_hidden_layers): - self.assertEqual(len(past_kv[i]), 4) # K V for the decoder + K V for the encoder = 4 - self.assertEqual( - past_kv[i][0].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - self.assertEqual( - past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - # The sequence length for the encoder K V depends on the model. Since it is not manipulated in - # autoregressive generation, I'm keeping the test general and not checking the 3rd dim - self.assertEqual( - (past_kv[i][2].shape[0], past_kv[i][2].shape[1], past_kv[i][2].shape[3]), - (batch_size, encoder_num_attention_heads, encoder_per_head_embed_dim), - ) - self.assertEqual( - (past_kv[i][3].shape[0], past_kv[i][3].shape[1], past_kv[i][3].shape[3]), - (batch_size, encoder_num_attention_heads, encoder_per_head_embed_dim), - ) - - # Decoder-only checks - else: - # TODO: this line is only needed because of imagegpt, where "pixel_values" = "input_ids". 
Fix the - # tests in imagegpt such that `prepare_config_and_inputs_for_common` returns the later (and the other - # tests use it) - key = "input_ids" if "input_ids" in inputs else "pixel_values" - batch_size, seq_length = inputs[key].shape - for i in range(num_hidden_layers): - self.assertEqual(len(past_kv[0]), 2) # K V for the decoder = 2 - self.assertEqual( - past_kv[i][0].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - self.assertEqual( - past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim) - ) - - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` - # if fails, you should probably update the `prepare_inputs_for_generation` function - for model_class in self.all_generative_model_classes: - config, input_ids, _, _ = self._get_input_ids_and_config() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - # b) embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge of the - # variable that holds the scaling factor, which is model-dependent) - if hasattr(config, "scale_embedding"): - config.scale_embedding = False - - # This test is for decoder-only models (encoder-decoder models have native input embeddings support in the - # decoder) - if config.is_encoder_decoder: - continue - - # Skip models without explicit support - model = model_class(config).eval() - if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): - continue - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = ops.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(ops.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) - - def test_generate_continue_from_past_key_values(self): - # Tests that we can continue generating from past key values, returned from a 
previous `generate` call - for model_class in self.all_generative_model_classes: - if any(model_name in model_class.__name__.lower() for model_name in ["imagegpt"]): - self.skipTest(reason="Won't fix: old model with unique inputs/caches/other") - if any(model_name in model_class.__name__.lower() for model_name in ["umt5"]): - self.skipTest(reason="TODO: needs modeling or test input preparation fixes for compatibility") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - if not hasattr(config, "use_cache"): - self.skipTest(reason="This model doesn't support caching") - - # Let's make it always: - # 1. use cache (for obvious reasons) - # 2. generate to max length (which can be achieved by setting the eos token to an invalid value), which - # would make the test flaky (e.g. EOS is generated on iteration 1 on both generations, but the - # continuation would force it to generate beyond an EOS token) - # 3. ignore `token_type_ids` for simplicity - # 4. ignore `forced_eos_token_id`, which requires further manipulation of the continuation inputs and is - # active by default on some models - config.use_cache = True - if "token_type_ids" in inputs: - del inputs["token_type_ids"] - - model = model_class(config) - model.eval() - model.generation_config.pad_token_id = model.generation_config.eos_token_id = -1 - model.generation_config.forced_eos_token_id = None - # If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format) - outputs = model(**inputs) - if "past_key_values" not in outputs: - self.skipTest(reason="This model doesn't return `past_key_values`") - - # Traditional way of generating text, with `return_dict_in_generate` to return the past key values - outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True) - - # Let's generate again, but passing the past key values in between (3 + 1 = 4 tokens). Note that the - # inputs may need to be tweaked across `generate` calls (like the attention mask). 
- outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=3, return_dict_in_generate=True) - - # Continue from the tokens generated above, preparing the inputs accordingly - inputs["past_key_values"] = outputs_cached.past_key_values - new_attention_len = outputs_cached.sequences.shape[-1] - if config.is_encoder_decoder: - inputs["decoder_input_ids"] = outputs_cached.sequences - if "decoder_attention_mask" in inputs: - inputs["decoder_attention_mask"] = nn.functional.pad( - inputs["decoder_attention_mask"], - (0, new_attention_len - inputs["decoder_attention_mask"].shape[1]), - mode="constant", - value=1, - ) - else: - inputs["input_ids"] = outputs_cached.sequences - if "attention_mask" in inputs: - inputs["attention_mask"] = nn.functional.pad( - inputs["attention_mask"], - (0, new_attention_len - inputs["attention_mask"].shape[1]), - mode="constant", - value=1, - ) - outputs_cached = model.generate(**inputs, do_sample=False, max_new_tokens=1, return_dict_in_generate=True) - # The two sets of generated text and past kv should be equal to each other - self.assertListEqual(outputs.sequences.tolist(), outputs_cached.sequences.tolist()) - for layer_idx in range(len(outputs_cached.past_key_values)): - for kv_idx in range(len(outputs_cached.past_key_values[layer_idx])): - self.assertTrue( - ops.allclose( - outputs.past_key_values[layer_idx][kv_idx], - outputs_cached.past_key_values[layer_idx][kv_idx], - 1e-3, 1e-3 - ) - ) - - @parameterized.expand([(1, False), (1, True), (4, False)]) - @pytest.mark.generate - def test_new_cache_format(self, num_beams, do_sample): - # Tests that generating with the new format is exactly the same as the legacy one (for models that support it). - # 👉 tests with and without beam search so that we can test with and without cache reordering. - # 👉 tests with and without sampling so we can cover the most common use cases. 
- for model_class in self.all_generative_model_classes: - if not model_class._supports_cache_class: - self.skipTest(reason="This model does not support the new cache format") - - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).eval() - generation_kwargs = { - "max_new_tokens": 5, - "do_sample": do_sample, - "num_beams": num_beams, - "num_return_sequences": num_beams, - "return_dict_in_generate": True, # Required to return `past_key_values` - "use_cache": True, - } - - # Sets seed before calling `generate` for the case with do_sample=True - seed = ops.randint(0, 1000000, (1,)).item() - set_seed(seed) - legacy_results = model.generate( - input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict - ) - set_seed(seed) - num_hidden_layers = config.get_text_config().num_hidden_layers - if config.is_encoder_decoder: - cache_cls = EncoderDecoderCache - past_key_values = cache_cls(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers)) - else: - cache_cls = DynamicCache - past_key_values = cache_cls(num_hidden_layers) - new_results = model.generate( - input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - **generation_kwargs, - **inputs_dict, - ) - - # The two sets of generated sequences must match, despite the cache format between forward passes being - # different - self.assertListEqual(legacy_results.sequences.tolist(), new_results.sequences.tolist()) - self.assertTrue(isinstance(legacy_results.past_key_values, tuple)) - self.assertTrue(isinstance(new_results.past_key_values, cache_cls)) - - # The contents of the two caches, when converted to the same format (in both directions!), must match - legacy_cache = legacy_results.past_key_values - new_cache_converted = new_results.past_key_values.to_legacy_cache() - for layer_idx in range(len(legacy_cache)): - for kv_idx in range(len(legacy_cache[layer_idx])): - # TODO: @raushan, please look into this for new cache format - if legacy_cache[layer_idx][kv_idx] != []: - self.assertTrue( - ops.allclose( - legacy_cache[layer_idx][kv_idx], - new_cache_converted[layer_idx][kv_idx], - ) - ) - - new_cache = new_results.past_key_values - legacy_cache_converted = cache_cls.from_legacy_cache(legacy_results.past_key_values) - for layer_idx in range(len(new_cache)): - for kv_idx in range(len(new_cache[layer_idx])): - # TODO: @raushan, please look into this for new cache format - if new_cache[layer_idx][kv_idx] != []: - self.assertTrue( - ops.allclose( - new_cache[layer_idx][kv_idx], - legacy_cache_converted[layer_idx][kv_idx], - ) - ) - - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, seq_length = input_ids.shape - config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences - - gen_len = ( - output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length - ) - - # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) - - # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) - - # Attentions - if self.has_attentions: - if config.is_encoder_decoder: - # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) - # decoder - self._check_attentions_for_generate( - num_sequences_in_output, - output.decoder_attentions, - min_length=1, - 
max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - else: - # if use_cache first input is equal to no use_cache, so skip here - attentions = output.attentions if not use_cache else output.attentions[1:] - min_length = seq_length if not use_cache else seq_length + 1 - self._check_attentions_for_generate( - num_sequences_in_output, - attentions=attentions, - min_length=min_length, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Hidden States - if config.is_encoder_decoder: - # encoder - self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length - ) - - # decoder - self._check_hidden_states_for_generate( - num_sequences_in_output, - output.decoder_hidden_states, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - else: - # if use_cache first input is equal to no use_cache, so skip here - hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] - min_length = seq_length if not use_cache else seq_length + 1 - self._check_hidden_states_for_generate( - num_sequences_in_output, - hidden_states, - min_length=min_length, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ("ctrl", "fsmt", "gptbigcode", "mega", "reformer", "jamba", "mamba", "xlnet") - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) - - def _check_scores(self, batch_size, scores, length, config): - expected_shape = (batch_size, config.vocab_size) - self.assertIsInstance(scores, tuple) - self.assertEqual(len(scores), length) - self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) - - def _check_logits(self, batch_size, scores, config): - self.assertIsInstance(scores, tuple) - self.assertListEqual([iter_scores.shape[0] for iter_scores in scores], [batch_size] * len(scores)) - # vocabulary difference equal to one (imagegptmodel?) 
or zero (all other models) - vocab_diff = config.vocab_size - scores[0].shape[-1] - self.assertTrue(vocab_diff in [0, 1]) - self.assertListEqual([config.vocab_size - score.shape[-1] for score in scores], [vocab_diff] * len(scores)) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - - for idx, iter_attentions in enumerate(attentions): - tgt_len = min_length + idx if not use_cache else 1 - src_len = min_length + idx - - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - src_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) - ) - - def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length): - encoder_expected_shape = (batch_size, config.num_attention_heads, seq_length, seq_length) - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [layer_attentions.shape for layer_attentions in attentions], - [encoder_expected_shape] * len(attentions), - ) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - - for idx, iter_hidden_states in enumerate(hidden_states): - seq_len = min_length + idx if not use_cache else 1 - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - # check hidden size - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], - [expected_shape] * len(iter_hidden_states), - ) - - def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, seq_length): - encoder_expected_shape = (batch_size, seq_length, config.hidden_size) - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in hidden_states], - [encoder_expected_shape] * len(hidden_states), - ) - - def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_length, config, num_beam_groups=1): - self.assertIsInstance(past_key_values, tuple) - self.assertListEqual( - [isinstance(iter_past_key_values, tuple) for iter_past_key_values in past_key_values], - [True] * len(past_key_values), - ) - - # (batch, head, seq_length, head_features) - expected_shape = ( - batch_size * num_beam_groups, - config.num_key_value_heads if hasattr(config, "num_key_value_heads") else config.num_attention_heads, - seq_length, - config.hidden_size // config.num_attention_heads, - ) - # check shape key, value - self.assertListEqual( - [layer_past_key_values[0].shape for layer_past_key_values in past_key_values], - [expected_shape] * len(past_key_values), - ) - self.assertListEqual( - [layer_past_key_values[1].shape for layer_past_key_values in past_key_values], - [expected_shape] * len(past_key_values), - ) - - def _check_sequence_inside_sequence(self, tensor_1, tensor_2): - # 
check if tensor_1 inside tensor_2 or tensor_2 inside tensor_1. - # set to same device. we don't care what device. - - if not isinstance(tensor_1, list): - tensor_1 = tensor_1.tolist() - if not isinstance(tensor_2, list): - tensor_2 = tensor_2.tolist() - - in_order = len(tensor_1) <= len(tensor_2) - longer = tensor_2 if in_order else tensor_1 - shorter = tensor_1 if in_order else tensor_2 - - flag = False - chunk_size = len(shorter) - for chunk_idx in range(len(longer) - chunk_size + 1): - subseq = longer[chunk_idx : chunk_idx + chunk_size] - if subseq == shorter: - flag = True - break - - self.assertTrue(flag) - - -@require_mindspore -class UtilsFunctionsTest(unittest.TestCase): - def test_speculative_sampling(self): - # assume vocab size 10, input length 5 + 3 generated candidates - candidate_input_ids = mindspore.tensor([[8, 0, 3, 9, 8, 1, 4, 5]]) # input tokens - candidate_logits = mindspore.tensor( - [ - [ - [-10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # generated 1 - [-10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # generated 4 - [-10.0, -10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0], # generated 5 - ] - ] - ) - candidate_length = 3 - inf = float("inf") - new_logits = mindspore.tensor( - [ - [ - [-10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # accepts 1 - [-10.0, -10.0, -10.0, -10.0, 10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # accepts 4 - [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, 10.0, -inf], # rejects 5, accepts 8 - [-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0], # N/A - ] - ] - ) - last_assistant_token_is_eos = False - validated_tokens, n_matches = _speculative_sampling( - candidate_input_ids, - candidate_logits, - candidate_length, - new_logits, - last_assistant_token_is_eos, - ) - self.assertTrue(n_matches.item() == 2) - self.assertTrue(validated_tokens.tolist()[0] == [1, 4, 8]) - - -@require_mindspore -class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports - if is_mindspore_available(): - framework_dependent_parameters = { - "AutoModelForCausalLM": AutoModelForCausalLM, - "AutoModelForSpeechSeq2Seq": AutoModelForSpeechSeq2Seq, - "AutoModelForSeq2SeqLM": AutoModelForSeq2SeqLM, - "AutoModelForVision2Seq": AutoModelForVision2Seq, - "LogitsProcessorList": LogitsProcessorList, - "MinLengthLogitsProcessor": MinLengthLogitsProcessor, - "create_tensor_fn": mindspore.tensor, - "floats_tensor": floats_tensor, - "return_tensors": "ms", - } - - @slow - def test_diverse_beam_search(self): - # PT-only test: TF doesn't have a diverse beam search implementation - article = """Justin Timberlake and Jessica Biel, welcome to parenthood. - The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People. - "Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first," People reports. - The couple announced the pregnancy in January, with an Instagram post. 
It is the first baby for both.""" - - bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") - input_ids = bart_tokenizer(article, return_tensors="ms").input_ids - - outputs = bart_model.generate( - input_ids, - num_beams=4, - num_return_sequences=2, - num_beam_groups=4, - diversity_penalty=2.0, - remove_invalid_values=True, - ) - - generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The couple announced the birth of their son, Silas Randall Timberlake, in a statement. Silas was the" - " middle name of Timberlake's maternal grandfather Bill Bomar. Randall is the musician's own middle" - " name, as well as his father's first. It is the first baby for both of them.", - "Justin Timberlake and Jessica Biel have a son. The baby is named Silas Randall Timberlake. It is the" - " first child for both. The couple announced the pregnancy in January. The name Silas is the middle" - " name of Timberlake's maternal grandfather. It's also his own middle name.", - ], - ) - - def test_max_length_if_input_embeds(self): - # PT-only test: TF doesn't have StoppingCriteria - article = "Today a dragon flew over Paris." - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_ids = tokenizer(article, return_tensors="ms").input_ids - inputs_embeds = model.get_input_embeddings()(input_ids) - - max_length = 20 - input_len = input_ids.shape[-1] - out_gen = model.generate(input_ids=input_ids, max_length=max_length) - out_gen_embeds = model.generate(inputs_embeds=inputs_embeds, max_length=max_length) - self.assertEqual(out_gen.shape[-1], input_len + out_gen_embeds.shape[-1]) - - def test_min_length_if_input_embeds(self): - # PT-only test: TF doesn't have StoppingCriteria - article = "Today a dragon flew over Paris." 
- model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_ids = tokenizer(article, return_tensors="ms").input_ids - inputs_embeds = model.get_input_embeddings()(input_ids) - - min_length = 10 - input_len = input_ids.shape[-1] - out_gen = model.generate(input_ids=input_ids, min_length=min_length) - out_gen_embeds = model.generate(inputs_embeds=inputs_embeds, min_length=min_length) - self.assertEqual(out_gen.shape[-1], input_len + out_gen_embeds.shape[-1]) - - def test_custom_stopping_criteria_overload_error(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") - bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random") - - input_ids = bart_tokenizer(article, return_tensors="ms").input_ids - stopping_criteria = StoppingCriteriaList() - stopping_criteria.append(MaxLengthCriteria(max_length=42)) - with self.assertRaises(ValueError): - bart_model.generate(input_ids, stopping_criteria=stopping_criteria) - with self.assertRaises(ValueError): - bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=32) - - def test_custom_stopping_criteria(self): - # PT-only test: TF doesn't have StoppingCriteria - article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" - bart_tokenizer = BartTokenizer.from_pretrained("sshleifer/bart-tiny-random") - bart_model = BartForConditionalGeneration.from_pretrained("sshleifer/bart-tiny-random") - input_ids = bart_tokenizer(article, return_tensors="ms").input_ids - - class DummyCriteria(StoppingCriteria): - def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor, **kwargs) -> bool: - return input_ids.shape[-1] >= 20 - - stopping_criteria = StoppingCriteriaList() - stopping_criteria.append(DummyCriteria()) - - self.assertEqual( - list(bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=22).shape), - [1, 20], - ) - self.assertEqual( - list(bart_model.generate(input_ids, stopping_criteria=stopping_criteria, max_length=18).shape), - [1, 18], - ) - - # TODO (joao): replace `stop_sequence` in the pipeline by the more recent `generate` functionality - def test_stop_sequence_stopping_criteria(self): - # PT-only test: TF doesn't have StoppingCriteria - prompt = """Hello I believe in""" - generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-bart") - output = generator(prompt) - self.assertEqual( - output, - [{"generated_text": ("Hello I believe in we we we we we we we we we")}], - ) - - output = generator(prompt, stop_sequence=" we") - self.assertEqual(output, [{"generated_text": "Hello I believe in we"}]) - - def test_generate_non_nlp_input_ids_as_kwarg(self): - # PT-only test: AFAIK there's no non-NLP model architecture in TF that supports `input_ids` as its only input - model = ImageGPTForCausalImageModeling.from_pretrained( - "hf-internal-testing/tiny-random-imagegpt", max_length=10 - ) - input_ids = ids_tensor((3, 5), vocab_size=10) - - output_sequences_kwargs = model.generate(input_ids=input_ids) - output_sequences = model.generate(input_ids) - - self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) - self.assertEqual(output_sequences.shape, (3, 10)) - - def test_generate_input_values_as_encoder_kwarg(self): - # PT-only test: AFAIK 
there's no generate-capable architecture in TF that supports `input_values` as its input - input_values = floats_tensor((2, 250)) - model = SpeechEncoderDecoderModel.from_pretrained("hf-internal-testing/tiny-random-speech-encoder-decoder") - model = model - output_sequences_kwargs = model.generate(input_values=input_values, max_length=5) - output_sequences = model.generate(input_values, max_length=5) - - self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) - self.assertEqual(output_sequences.shape, (2, 5)) - - def test_transition_scores_group_beam_search_encoder_decoder(self): - # PT-only test: TF doesn't have group beam search - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = BartForConditionalGeneration.from_pretrained( - "hf-internal-testing/tiny-random-bart", - max_length=10, - num_beams=2, - num_beam_groups=2, - num_return_sequences=2, - diversity_penalty=1.0, - eos_token_id=None, - return_dict_in_generate=True, - output_scores=True, - length_penalty=0.0, - ) - model = model - - input_ids = tokenizer(articles, return_tensors="ms", padding=True).input_ids - outputs = model.generate(input_ids=input_ids) - - transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) - transition_scores_sum = transition_scores.sum(-1) - - self.assertTrue(ops.allclose(transition_scores_sum, outputs.sequences_scores, atol=1e-3)) - - def test_beam_search_low_memory(self): - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") - tokenizer.pad_token_id = tokenizer.eos_token_id - model_inputs = tokenizer("I", return_tensors="ms")["input_ids"] - - low_output = model.generate(model_inputs, max_new_tokens=40, num_beams=5, early_stopping=True, low_memory=True) - - high_output = model.generate( - model_inputs, max_new_tokens=40, num_beams=5, early_stopping=True, low_memory=False - ) - self.assertListEqual(low_output.tolist(), high_output.tolist()) - - @slow - def test_watermark_generation(self): - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") - tokenizer.pad_token_id = tokenizer.eos_token_id - model_inputs = tokenizer("I will be", return_tensors="ms") - input_len = model_inputs["input_ids"].shape[-1] - - # generation should work with both input types: WatermarkingConfig or Dict, so let's check it here :) - watermark_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash") - _ = model.generate(**model_inputs, watermarking_config=watermark_config, do_sample=False, max_length=15) - - # We will not check watermarked text, since we check it in `logits_processors` tests - # Checking if generated ids are as expected fails on different hardware - args = { - "bias": 2.0, - "context_width": 1, - "seeding_scheme": "selfhash", - "greenlist_ratio": 0.25, - "hashing_key": 15485863, - } - output = model.generate(**model_inputs, do_sample=False, max_length=15) - output_selfhash = model.generate(**model_inputs, watermarking_config=args, do_sample=False, max_length=15) - - # Check that the detector is detecting watermarked text - detector = WatermarkDetector(model_config=model.config, watermarking_config=args) - detection_out_watermarked = detector(output_selfhash[:, 
input_len:], return_dict=True) - detection_out = detector(output[:, input_len:], return_dict=True) - - self.assertListEqual(detection_out_watermarked.prediction.tolist(), [True]) - self.assertListEqual(detection_out.prediction.tolist(), [False]) - - @slow - def test_beam_search_example_integration(self): - # PT-only test: TF doesn't have a BeamSearchScorer - # exactly the example provided in the docstrings of beam search, which previously - # failed after directly copying from it. Refer to PR #15555 - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - encoder_input_str = "translate English to German: How old are you?" - encoder_input_ids = tokenizer(encoder_input_str, return_tensors="ms").input_ids - - # lets run beam search using 3 beams - num_beams = 3 - # define decoder start token ids - input_ids = ops.ones((1, 1), dtype=mindspore.int64) - input_ids = input_ids * model.config.decoder_start_token_id - - # add encoder_outputs to model keyword arguments - model_kwargs = {"encoder_outputs": model.get_encoder()(encoder_input_ids, return_dict=True)} - - outputs = model.generate( - input_ids, num_beams=num_beams, min_length=5, eos_token_id=model.config.eos_token_id, **model_kwargs - ) - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual(outputs, ["Wie alt bist du?"]) - - @slow - def test_constrained_beam_search(self): - # PT-only test: TF doesn't have constrained beam search - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - - force_tokens = tokenizer("scared", add_prefix_space=True, add_special_tokens=False).input_ids - force_tokens_2 = tokenizer("big weapons", add_prefix_space=True, add_special_tokens=False).input_ids - - constraints = [ - PhrasalConstraint(force_tokens), - PhrasalConstraint(force_tokens_2), - ] - - starting_text = ["The soldiers were not prepared and"] - - input_ids = tokenizer(starting_text, return_tensors="ms").input_ids - - outputs = model.generate( - input_ids, - constraints=constraints, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - max_length=30, - remove_invalid_values=True, - ) - - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The soldiers were not prepared and didn't know what to do. 
They had no idea how they would react if" - " the enemy attacked them, big weapons scared" - ], - ) - - @slow - def test_constrained_beam_search_mixed(self): - # PT-only test: TF doesn't have constrained beam search - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - - force_phrase = tokenizer("scared", add_prefix_space=True, add_special_tokens=False).input_ids - flexible_phrases = tokenizer( - ["scream", "screams", "screaming", "screamed"], add_prefix_space=True, add_special_tokens=False - ).input_ids - - constraints = [ - PhrasalConstraint(force_phrase), - DisjunctiveConstraint(flexible_phrases), - ] - - starting_text = ["The soldiers", "The child"] - - input_ids = tokenizer(starting_text, return_tensors="ms").input_ids - - outputs = model.generate( - input_ids, - constraints=constraints, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - # max_length=20, - remove_invalid_values=True, - ) - - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The soldiers, who had been stationed at the base for more than a year before being evacuated" - " screaming scared", - "The child was taken to a local hospital where he died.\n 'I don't think screaming scared", - ], - ) - - @slow - def test_constrained_beam_search_mixed_mixin(self): - # PT-only test: TF doesn't have constrained beam search - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - - force_word = "scared" - force_flexible = ["scream", "screams", "screaming", "screamed"] - - force_words_ids = [ - tokenizer([force_word], add_prefix_space=True, add_special_tokens=False).input_ids, - tokenizer(force_flexible, add_prefix_space=True, add_special_tokens=False).input_ids, - ] - - starting_text = ["The soldiers", "The child"] - - input_ids = tokenizer(starting_text, return_tensors="ms").input_ids - - outputs = model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The soldiers, who had been stationed at the base for more than a year before being evacuated" - " screaming scared", - "The child was taken to a local hospital where he died.\n 'I don't think screaming scared", - ], - ) - - @slow - def test_cfg_mixin(self): - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - - input = tokenizer(["The dragon flew over Paris,"], return_tensors="ms", return_attention_mask=True) - input["input_ids"] = input["input_ids"] - input["attention_mask"] = input["attention_mask"] - - outputs = model.generate(**input, max_new_tokens=32, guidance_scale=1.5) - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "The dragon flew over Paris, landing in the Rue de la Bastille. 
The crowd was so excited " - 'that they had to leave the city.\n\n"We\'re going to Paris!"\n' - ], - ) - - neg = tokenizer(["France,"], return_tensors="ms", return_attention_mask=True) - neg["input_ids"] = neg["input_ids"] - neg["attention_mask"] = neg["attention_mask"] - outputs = model.generate( - **input, - max_new_tokens=32, - guidance_scale=1.5, - negative_prompt_ids=neg["input_ids"], - negative_prompt_attention_mask=neg["attention_mask"], - ) - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - 'The dragon flew over Paris, landing on the pavement.\n\n"Paris!"\n\n"Paris!"\n\n"' - 'Paris!"\n\n"Paris!"\n\n"Paris!"\n\n' - ], - ) - - @slow - def test_constrained_beam_search_example_translation_mixin(self): - # PT-only test: TF doesn't have constrained beam search - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - encoder_input_str = "translate English to German: How old are you?" - force_words = ["sind"] - - input_ids = tokenizer(encoder_input_str, return_tensors="ms").input_ids - force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids - - outputs = model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual(outputs, ["Wie alt sind Sie?"]) - - @slow - def test_constrained_beam_search_example_integration(self): - # PT-only test: TF doesn't have constrained beam search - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - encoder_input_str = "translate English to German: How old are you?" 
- encoder_input_ids = tokenizer(encoder_input_str, return_tensors="ms").input_ids - - # lets run beam search using 5 beams - num_beams = 5 - # define decoder start token ids - input_ids = ops.ones((1, 1), dtype=mindspore.int64) - input_ids = input_ids * model.config.decoder_start_token_id - - # add encoder_outputs to model keyword arguments - model_kwargs = {"encoder_outputs": model.get_encoder()(encoder_input_ids, return_dict=True)} - - constraint_str = "sind" - constraint_token_ids = tokenizer.encode(constraint_str)[:-1] # remove eos token - - outputs = model.generate( - input_ids, - num_beams=num_beams, - force_words_ids=[constraint_token_ids], - min_length=5, - eos_token_id=model.config.eos_token_id, - **model_kwargs, - ) - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual(outputs, ["Wie alt sind Sie?"]) - - @slow - def test_per_row_stopping_criteria(self): - text = [ - "They completed the challenging puzzle, revealing the hidden", - "Today a dragon flew over France", - "The aroma of freshly baked pizza filled the kitchen", - ] - stop_strings = ["secrets"] - - model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") - tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") - tokenizer.padding_side = "left" - tokenizer.pad_token_id = tokenizer.eos_token_id - input_ids = tokenizer(text, return_tensors="ms", padding="longest", add_special_tokens=False).input_ids - - # normal generation with one stopping criteria - out = model.generate(input_ids, max_length=15) - out_text = tokenizer.batch_decode(out) - expected_out = [ - "They completed the challenging puzzle, revealing the hidden secrets of the world.\n", - "<|endoftext|><|endoftext|><|endoftext|>Today a dragon flew over France and the French government was forced", - "The aroma of freshly baked pizza filled the kitchen with a sense of freshness", - ] - self.assertListEqual(out_text, expected_out) - - # generation should stop at "secrets" for first batch only, filling the rest with eos tokens - out = model.generate(input_ids, max_length=15, stop_strings=stop_strings, tokenizer=tokenizer) - out_text = tokenizer.batch_decode(out) - expected_out = [ - "They completed the challenging puzzle, revealing the hidden secrets<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>", - "<|endoftext|><|endoftext|><|endoftext|>Today a dragon flew over France and the French government was forced", - "The aroma of freshly baked pizza filled the kitchen with a sense of freshness", - ] - self.assertListEqual(out_text, expected_out) - - def test_constrained_beam_search_mixin_type_checks(self): - # PT-only test: TF doesn't have constrained beam search - tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random") - model = AutoModelForSeq2SeqLM.from_pretrained("patrickvonplaten/t5-tiny-random") - - encoder_input_str = "translate English to German: How old are you?" 
- input_ids = tokenizer(encoder_input_str, return_tensors="ms").input_ids - - with self.assertRaises(ValueError): - force_words = ["sind"] - force_words_ids = tokenizer(force_words, return_tensors="ms").input_ids - model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - with self.assertRaises(ValueError): - force_words = ["sind"] - force_words_ids = [tokenizer(force_words, return_tensors="ms").input_ids] - model.generate( - input_ids, - force_words_ids=force_words_ids, - num_beams=10, - num_return_sequences=1, - no_repeat_ngram_size=1, - remove_invalid_values=True, - ) - - with self.assertRaises(ValueError): - model.generate(input_ids, force_words_ids=[]) - - with self.assertRaises(ValueError): - model.generate(input_ids, force_words_ids=[[-1]]) - - with self.assertRaises(ValueError): - model.generate(input_ids, force_words_ids=[[[-1]]]) - - def test_batched_decoder_start_id(self): - # PT-only test: TF doesn't support batched_decoder_start_id - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart") - input_ids = bart_tokenizer(articles, return_tensors="ms", padding=True).input_ids - decoder_start_token_id = bart_model.generation_config.decoder_start_token_id - decoder_start_token_id_batch = [decoder_start_token_id] * input_ids.shape[0] - - outputs = bart_model.generate(input_ids, decoder_start_token_id=decoder_start_token_id) - - outputs_batched_ids = bart_model.generate(input_ids, decoder_start_token_id=decoder_start_token_id_batch) - - self.assertListEqual(outputs.tolist(), outputs_batched_ids.tolist()) - - def test_decoder_start_id_from_config(self): - # Refer to: (#30899) - articles = [ - "Justin Timberlake and Jessica Biel, welcome to parenthood.", - "Michael Phelps is arguably the most decorated Olympian of all time.", - ] - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart") - input_ids = bart_tokenizer(articles, return_tensors="ms", padding=True).input_ids - decoder_start_token_id = bart_model.generation_config.decoder_start_token_id - - # we should be able to take `decoder_start_token_id` from model's generation config if user passes a `GenerationConfig` type - outputs = bart_model.generate(input_ids, generation_config=GenerationConfig(do_sample=False)) - - # If the generatoin config has no `decoder_start_token_id` or `bos_token_id`, we will raise an error unless user passes it in config - bart_model.generation_config.decoder_start_token_id = None - bart_model.generation_config.bos_token_id = None - outputs_with_user_id = bart_model.generate( - input_ids, - generation_config=GenerationConfig(do_sample=False, decoder_start_token_id=decoder_start_token_id), - ) - - self.assertListEqual(outputs.tolist(), outputs_with_user_id.tolist()) - - with self.assertRaises(ValueError): - outputs = bart_model.generate(input_ids, generation_config=GenerationConfig(do_sample=False)) - - def test_contrastive_search_batched(self): - # PT-only test: TF doesn't have constrained beam search - # Tests that contrastive search works with batched inputs (i.e. 
has the same output as for non-batched inputs) - articles = ["Foo", "Bar Baz"] - tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart") - - model.config.eos_token_id = None - input_ids_batched = tokenizer(articles, padding=True, return_tensors="ms").input_ids - input_ids = tokenizer(articles[1], return_tensors="ms").input_ids - - output_sequences_batched = model.generate( - input_ids=input_ids_batched, penalty_alpha=0.6, top_k=4, return_dict_in_generate=True, output_scores=True - ) - output_sequences = model.generate( - input_ids=input_ids, penalty_alpha=0.6, top_k=4, return_dict_in_generate=True, output_scores=True - ) - - batched_out = tokenizer.decode(output_sequences_batched.sequences[1], skip_special_tokens=True) - out = tokenizer.decode(output_sequences.sequences[0], skip_special_tokens=True) - self.assertEqual(batched_out, out) - - # output_sequences_batched.scores[0][1] -> 1st set of logits, 2nd sequence - max_score_diff = (output_sequences_batched.scores[0][1] - output_sequences.scores[0][0]).abs().max() - self.assertTrue(max_score_diff < 1e-5) - - def test_logits_processor_not_inplace(self): - # PT-only test: TF fixes were not made - article = "Today a dragon flew over Paris." - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - input_ids = tokenizer(article, return_tensors="ms").input_ids - - out = model.generate(input_ids, output_logits=True, output_scores=True, return_dict_in_generate=True) - out_with_temp = model.generate( - input_ids, - temperature=0.5, - do_sample=True, - output_logits=True, - output_scores=True, - return_dict_in_generate=True, - ) - - # if no logits processor is used, scores == logits. Otherwise, the processor has to modify the scores - self.assertListEqual(out.logits[-1].tolist(), out.scores[-1].tolist()) - self.assertNotEqual(out_with_temp.logits[-1].tolist(), out_with_temp.scores[-1].tolist()) - - def test_eos_token_id_int_and_list_top_k_top_sampling(self): - # Has TF equivalent: this test relies on random sampling - generation_kwargs = { - "do_sample": True, - "num_beams": 1, - "top_p": 0.7, - "top_k": 10, - "temperature": 0.7, - } - expectation = 20 - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = """Hello, my dog is cute and""" - tokens = tokenizer(text, return_tensors="ms") - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - # Only some seeds will work both on CPU/GPU for a fixed `expectation` value. - # The selected seed is not guaranteed to work on all torch versions. 
- set_seed(1) - eos_token_id = 846 - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - set_seed(1) - eos_token_id = [846, 198] - generated_tokens = model.generate(**tokens, eos_token_id=eos_token_id, **generation_kwargs) - self.assertTrue(expectation == len(generated_tokens[0])) - - def test_model_kwarg_encoder_signature_filtering(self): - # Has TF equivalent: ample use of framework-specific code - bart_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - article = """Hugging Face is a technology company based in New York and Paris.""" - input_ids = bart_tokenizer(article, return_tensors="ms").input_ids - bart_model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart") - output = bart_model.generate(input_ids).numpy() - - # Let's create a fake model that has a different signature. In particular, this fake model accepts "foo" as an - # argument. Because "foo" is not in the encoder signature and doesn't start with "decoder_", it will be part of - # the encoder kwargs prior to signature filtering, which would lead to an exception. But filtering kicks in and - # saves the day. - class FakeBart(BartForConditionalGeneration): - def forward(self, input_ids, foo=None, **kwargs): - return super().forward(input_ids, **kwargs) - - bart_model = FakeBart.from_pretrained("hf-internal-testing/tiny-random-bart") - fake_output = bart_model.generate(input_ids, foo="bar").numpy() - self.assertTrue(np.array_equal(output, fake_output)) - - # Encoder signature filtering only kicks in if it doesn't accept wildcard kwargs. The following test will fail - # because it doesn't do signature filtering. - class FakeEncoder(bart_model.model.encoder.__class__): - def forward(self, input_ids, **kwargs): - return super().forward(input_ids, **kwargs) - - fake_encoder = FakeEncoder(bart_model.config, bart_model.model.shared) - bart_model.model.encoder = fake_encoder - - # Normal generation still works (the output will be different because the encoder weights are different) - fake_output = bart_model.generate(input_ids).numpy() - with self.assertRaises(TypeError): - # FakeEncoder.forward() accepts **kwargs -> no filtering -> type error due to unexpected input "foo" - bart_model.generate(input_ids, foo="bar") - - def test_default_max_length_warning(self): - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model.config.pad_token_id = tokenizer.eos_token_id - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - - # Default generation config value of 20 -> emits warning - with self.assertWarns(UserWarning): - model.generate(input_ids) - - # Explicitly setting max_length to 20 -> no warning - with warnings.catch_warnings(record=True) as warning_list: - model.generate(input_ids, max_length=20) - self.assertEqual(len(warning_list), 0) - - # Generation config max_length != 20 -> no warning - with warnings.catch_warnings(record=True) as warning_list: - # generation_config is modified -> legacy mode is disabled = generation_config takes precedence - model.generation_config.max_length = 10 - model.generate(input_ids) - self.assertEqual(len(warning_list), 0) - - def test_length_warning_assisted_generation(self): - # PT-only test: TF doesn't support assisted decoding yet. 
- model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model.config.pad_token_id = tokenizer.eos_token_id - assistant.config.pad_token_id = tokenizer.eos_token_id - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - - # This should not raise any warning that min length is not feasible in candidate generation - with warnings.catch_warnings(record=True) as warning_list: - model.generate( - input_ids, - assistant_model=assistant, - min_new_tokens=10, - max_length=20, - ) - self.assertEqual(len(warning_list), 0) - - def test_generated_length_assisted_generation(self): - # PT-only test: TF doesn't support assisted decoding yet. - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model.config.pad_token_id = tokenizer.eos_token_id - assistant.config.pad_token_id = tokenizer.eos_token_id - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - input_length = input_ids.shape[-1] - - out = model.generate( - input_ids, - assistant_model=assistant, - min_new_tokens=10, - max_new_tokens=20, - ) - self.assertTrue((10 + input_length) <= out.shape[-1] <= (20 + input_length)) - - out = model.generate( - input_ids, - assistant_model=assistant, - min_new_tokens=10, - ) - self.assertTrue((input_length + 10) <= out.shape[-1] <= 20) - - def test_model_kwarg_assisted_decoding_decoder_only(self): - # PT-only test: TF doesn't support assisted decoding yet. - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - model.config.pad_token_id = tokenizer.eos_token_id - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - - # Traditional way of generating text - outputs_normal = model.generate(input_ids) - self.assertEqual(outputs_normal.shape, (1, 20)) - - # Should be different with token_type_ids - outputs_tti = model.generate( - input_ids, - token_type_ids=ops.zeros(input_ids.shape, dtype=mindspore.int64), - ) - with self.assertRaises(AssertionError): - self.assertListEqual(outputs_tti.tolist(), outputs_normal.tolist()) - - # Assistant model - assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - assistant.config.pad_token_id = tokenizer.eos_token_id - - # If assisted generation passes model_kwargs correctly, should be same as previous - outputs_assisted = model.generate( - input_ids, - token_type_ids=ops.zeros(input_ids.shape, dtype=mindspore.int64), - assistant_model=assistant, - ) - self.assertListEqual(outputs_assisted.tolist(), outputs_tti.tolist()) - - def test_model_kwarg_assisted_decoding_encoder_decoder(self): - """ - Tests that the following scenario is compatible with assisted generation: - 1. encoder-decoder main model - 2. encoder-decoder assistant model - 3. both have a custom input - (e.g. Whisper) - """ - - # PT-only test: TF doesn't support assisted decoding yet. 
- # Bart subclass with a kwarg that distorts the output - class FakeBart(BartForConditionalGeneration): - def forward(self, input_ids, past_key_values, foo=False, **kwargs): - outs = super().forward(input_ids, past_key_values=past_key_values, **kwargs) - if foo: - outs["logits"][:, :, :] = 0.0 - return outs - - def prepare_inputs_for_generation(self, *args, foo=False, encoder_outputs=None, **kwargs): - kwargs["encoder_outputs"] = encoder_outputs - inputs = super().prepare_inputs_for_generation(*args, **kwargs) - inputs["foo"] = foo - return inputs - - model = FakeBart.from_pretrained("hf-internal-testing/tiny-random-BartForConditionalGeneration") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BartForConditionalGeneration") - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - - # Traditional way of generating text - outputs_normal = model.generate(input_ids) - self.assertEqual(outputs_normal.shape, (1, 20)) - - # Should be different with foo - outputs_foo = model.generate(input_ids, foo=True) - with self.assertRaises(AssertionError): - self.assertListEqual(outputs_foo.tolist(), outputs_normal.tolist()) - - # Assistant model - assistant = FakeBart.from_pretrained("hf-internal-testing/tiny-random-BartForConditionalGeneration") - # If assisted generation passes model_kwargs correctly, should be same as previous - outputs_assisted = model.generate( - input_ids, - foo=True, - assistant_model=assistant, - ) - self.assertListEqual(outputs_assisted.tolist(), outputs_foo.tolist()) - - # Check that passing encoder_outputs directly also works as expected - encoder_outputs = assistant.get_encoder()(input_ids) - - outputs_assisted = model.generate( - foo=True, - assistant_model=assistant, - encoder_outputs=encoder_outputs, - assistant_encoder_outputs=encoder_outputs, - ) - self.assertListEqual(outputs_assisted.tolist(), outputs_foo.tolist()) - - def test_assisted_decoding_encoder_decoder_shared_encoder(self): - """ - Tests that the following scenario is compatible with assisted generation: - 1. encoder-decoder main model - 2. decoder-only assistant model - 3. both have a custom input - (e.g. DistilWhisper) - """ - - # PT-only test: TF doesn't support assisted decoding yet. 
- # Bart subclass with a kwarg called foo that distorts the output - class FakeBartSeq2Seq(BartForConditionalGeneration): - def forward(self, input_ids, foo=False, **kwargs): - outs = super().forward(input_ids, **kwargs) - if foo: - outs["logits"][:, :, :] = 0.0 - return outs - - def prepare_inputs_for_generation(self, *args, foo=False, encoder_outputs=None, **kwargs): - kwargs["encoder_outputs"] = encoder_outputs - inputs = super().prepare_inputs_for_generation(*args, **kwargs) - inputs["foo"] = foo - return inputs - - class FakeBartCausalLM(BartForCausalLM): - def forward(self, input_ids, attention_mask, past_key_values, foo=False, **kwargs): - outs = super().forward(input_ids, attention_mask, past_key_values=past_key_values, **kwargs) - if foo: - outs["logits"][:, :, :] = 0.0 - return outs - - def prepare_inputs_for_generation(self, *args, foo=False, encoder_outputs=None, **kwargs): - kwargs["encoder_outputs"] = encoder_outputs - inputs = super().prepare_inputs_for_generation(*args, **kwargs) - inputs["foo"] = foo - return inputs - - model = FakeBartSeq2Seq.from_pretrained("hf-internal-testing/tiny-random-BartForConditionalGeneration") - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BartForConditionalGeneration") - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - - # Traditional way of generating text - outputs_normal = model.generate(input_ids) - self.assertEqual(outputs_normal.shape, (1, 20)) - - # Should be different with foo - outputs_foo = model.generate(input_ids, foo=True) - with self.assertRaises(AssertionError): - self.assertListEqual(outputs_foo.tolist(), outputs_normal.tolist()) - - # Assistant model - assistant = FakeBartCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-BartForConditionalGeneration" - ) - - # If assisted generation passes model_kwargs correctly, should be same as previous - outputs_assisted = model.generate( - input_ids, - foo=True, - assistant_model=assistant, - ) - self.assertListEqual(outputs_assisted.tolist(), outputs_foo.tolist()) - - # Check that passing encoder_outputs directly also works as expected - encoder_outputs = model.get_encoder()(input_ids) - - outputs_assisted = model.generate( - foo=True, - assistant_model=assistant, - encoder_outputs=encoder_outputs, - ) - self.assertListEqual(outputs_assisted.tolist(), outputs_foo.tolist()) - - def test_assisted_decoding_num_assistant_tokens_heuristic_schedule(self): - # This test ensures that the assisted generation num_assistant_tokens 'heuristic' schedule works properly. 
- - prompt = "Alice and Bob" - checkpoint = "EleutherAI/pythia-160m-deduped" - tokenizer = AutoTokenizer.from_pretrained(checkpoint) - inputs = tokenizer(prompt, return_tensors="ms") - - model = AutoModelForCausalLM.from_pretrained(checkpoint) - - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 5 - assistant_model.generation_config.num_assistant_tokens_schedule = "heuristic" - generation_kwargs = { - "eos_token_id": -1, - "max_new_tokens": 5, - "do_sample": False, - "assistant_model": assistant_model, - } - model.generate(**inputs, **generation_kwargs) - # update_candidate_strategy is called only once and therefore, assistant_model.generation_config.num_assistant_tokens should be either 4 or 7 - self.assertTrue(assistant_model.generation_config.num_assistant_tokens in (4, 7)) - - def test_assisted_decoding_num_assistant_tokens_heuristic_transient_schedule(self): - # This test ensures that the assisted generation num_assistant_tokens 'heuristic' schedule works properly. - - prompt = "Alice and Bob" - checkpoint = "EleutherAI/pythia-160m-deduped" - tokenizer = AutoTokenizer.from_pretrained(checkpoint) - inputs = tokenizer(prompt, return_tensors="ms") - - model = AutoModelForCausalLM.from_pretrained(checkpoint) - - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 5 - assistant_model.generation_config.num_assistant_tokens_schedule = "heuristic_transient" - generation_kwargs = { - "eos_token_id": -1, - "max_new_tokens": 5, - "do_sample": False, - "assistant_model": assistant_model, - } - model.generate(**inputs, **generation_kwargs) - # update_candidate_strategy is called once but assistant_model.generation_config.num_assistant_tokens should stay 5 - self.assertEqual(assistant_model.generation_config.num_assistant_tokens, 5) - - @slow - def test_validate_assistant(self): - # Generate a random sample: - inputs = np.random.rand(160000) - - # Load a main encoder-decoder model: - model_id = "openai/whisper-large-v2" - processor = AutoProcessor.from_pretrained(model_id) - model = AutoModelForSpeechSeq2Seq.from_pretrained( - model_id, - low_cpu_mem_usage=True, - use_safetensors=True, - ) - model - - # process the input: - features = processor(inputs, return_tensors="ms") - - # Load an encoder-decoder assistant with same encoder as the main model: - assistant_distil_model_id = "distil-whisper/distil-large-v2" - assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained( - assistant_distil_model_id, - use_safetensors=True, - ) - self.assertTrue(model.generate(**features, assistant_model=assistant_seq_to_seq).sum()) - - # Load its decoder only version: - assistant_causal_lm = AutoModelForCausalLM.from_pretrained( - assistant_distil_model_id, - low_cpu_mem_usage=True, - use_safetensors=True, - ) - self.assertTrue(model.generate(**features, assistant_model=assistant_causal_lm).sum()) - - # Load an encoder-decoder assistant with a different encoder than the main model: - assistant_distil_model_id = "openai/whisper-tiny" - assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained( - assistant_distil_model_id, - use_safetensors=True, - ) - self.assertTrue(model.generate(**features, assistant_model=assistant_seq_to_seq).sum()) - - # Load its decoder only version: - assistant_causal_lm = AutoModelForCausalLM.from_pretrained( - assistant_distil_model_id, - low_cpu_mem_usage=True, - use_safetensors=True, - ) - # It will raise an error as the encoder of the main and assistant model are not compatible: - with 
self.assertRaises(ValueError): - model.generate(**features, assistant_model=assistant_causal_lm) - - # Load an encoder-decoder model with a different tokenizer than the main model: - assistant_distil_model_id = "hf-internal-testing/tiny-random-SeamlessM4Tv2ForSpeechToText" - assistant_seq_to_seq = AutoModelForSpeechSeq2Seq.from_pretrained( - assistant_distil_model_id, - ) - # This should raise an error as the main and assistant model don't use the same tokenizer: - with self.assertRaises(ValueError): - model.generate(**features, assistant_model=assistant_seq_to_seq) - - def test_compare_unprocessed_logit_scores(self): - # Get unprocessed logit scores back from model generate function. - # Assert that unprocessed logits from generate() are same as those from modal eval() - - # tell model to generate text and return unprocessed/unwarped logit scores - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = "generate yes or no: " - input_ids = tokenizer([text], return_tensors="ms").input_ids - - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - with no_grad(): - # Get logits for the next token from fwd pass - logits_fwd = model(input_ids).logits[:, -1, :][0] - - # Get logits for the next token from generate function - outputs = model.generate( - input_ids=input_ids, - return_dict_in_generate=True, - output_logits=True, - max_new_tokens=1, - do_sample=True, - ) - logits_gen = outputs.logits[0][0] - - # assert that unprocessed logits from generate() are same as those from modal eval() - self.assertListEqual(logits_fwd.tolist(), logits_gen.tolist()) - - def test_return_unprocessed_logit_scores(self): - # tell model to generate text and return unprocessed/unwarped logit scores - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - text = "generate yes or no: " - input_ids = tokenizer([text], return_tensors="ms").input_ids - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") - - outputs = model.generate( - input_ids=input_ids, return_dict_in_generate=True, output_logits=True, max_new_tokens=3 - ) - - # perform dummy check if unpreprocessed logits make sense. - # do preselection on high probabilities; find scores of y and n tokens - probs_all = nn.functional.softmax(outputs.logits[2][0], dim=-1) - indices = ops.argwhere(probs_all > 0.001) - indices = indices[:, -1] - tokens_max = tokenizer.batch_decode(indices, skip_special_tokens=True) - probs_max = probs_all[probs_all > 0.001] - - self.assertTrue(len(indices) >= 2) - next_token_dict = {str(t): p for t, p in zip(tokens_max, probs_max)} - self.assertTrue("n" in next_token_dict) - self.assertTrue("y" in next_token_dict) - y_prob = next_token_dict["y"] - n_prob = next_token_dict["n"] - - self.assertTrue(y_prob > 0.001 and n_prob > 0.001) - self.assertTrue(y_prob <= 1.0 and n_prob <= 1.0) - - - @slow - @require_mindspore - def test_assisted_decoding_in_gpu_cpu(self): - # PT-only test: TF doesn't support assisted decoding yet. 
- model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to("cuda") - assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to( - "cpu" - ) - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") - model.config.pad_token_id = tokenizer.eos_token_id - assistant.config.pad_token_id = tokenizer.eos_token_id - - text = "Hello world" - tokenized_inputs = tokenizer([text], return_tensors="ms") - input_ids = tokenized_inputs.input_ids - input_length = input_ids.shape[-1] - - out = model.generate( - input_ids, - assistant_model=assistant, - max_new_tokens=20, - ) - self.assertTrue(input_length <= out.shape[-1] <= input_length + 20) - - def test_special_tokens_fall_back_to_model_default(self): - # PT-only test: TF doesn't support assisted decoding yet. - model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") - test_bos_id = 50 - - # Sanity-check: the model has a BOS token set, and the first generated token is a BOS token - gen_output = model.generate() - self.assertTrue(model.generation_config.bos_token_id is not None) - self.assertTrue(model.generation_config.bos_token_id == gen_output[0, 0]) - - # If we pass a generation config **with** a BOS token, `generate` will use it - generation_config = GenerationConfig(bos_token_id=test_bos_id) - gen_output = model.generate(generation_config=generation_config) - self.assertFalse(model.generation_config.bos_token_id == gen_output[0, 0]) - self.assertTrue(generation_config.bos_token_id == gen_output[0, 0]) - self.assertTrue(test_bos_id == gen_output[0, 0]) - - # If we pass a generation config **without** a BOS token, `generate` will fetch the BOS token from - # `model.generation_config` - generation_config = GenerationConfig(bos_token_id=None) - gen_output = model.generate(generation_config=generation_config) - self.assertTrue(model.generation_config.bos_token_id == gen_output[0, 0]) - self.assertFalse(test_bos_id == gen_output[0, 0]) - self.assertTrue(generation_config.bos_token_id is None) - - # Changing `model.generation_config` will affect fallback behavior - model.generation_config.bos_token_id = test_bos_id - gen_output = model.generate(generation_config=generation_config) - self.assertTrue(model.generation_config.bos_token_id == gen_output[0, 0]) - self.assertTrue(test_bos_id == gen_output[0, 0]) - self.assertTrue(generation_config.bos_token_id is None) - - -@require_mindspore -class TokenHealingTestCase(unittest.TestCase): - @parameterized.expand( - [ - ( - "square_bracket", - 'An example ["like this"] and another example [', - 'An example ["like this"] and another example ["', - ), - ("url", 'The link is ") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "▁eloquent") - self.assertEqual(len(vocab_keys), 30_000) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 30_000) - - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - sequence = "I was born in 92000, and this is falsé." 
- - tokens = tokenizer.tokenize(sequence) - rust_tokens = rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - rust_tokenizer = self.get_rust_tokenizer() - ids = tokenizer.encode(sequence) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - def test_full_tokenizer(self): - tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"]) - - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual( - tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."] - ) - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual( - back_tokens, - ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "."], - ) - - def test_sequence_builders(self): - tokenizer = AlbertTokenizer(SAMPLE_VOCAB) - - text = tokenizer.encode("sequence builders") - text_2 = tokenizer.encode("multi-sequence build") - - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] - assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ - tokenizer.sep_token_id - ] - - @slow - def test_tokenizer_integration(self): - # fmt: off - expected_encoding = {'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'input_ids': [[2, 21970, 13, 5, 6092, 167, 28, 7103, 2153, 673, 8, 7028, 12051, 18, 17, 7103, 2153, 673, 8, 3515, 18684, 8, 4461, 6, 1927, 297, 8, 12060, 2607, 18, 13, 5, 4461, 15, 10538, 38, 8, 135, 15, 822, 58, 15, 993, 10363, 15, 1460, 8005, 4461, 15, 993, 255, 2328, 9, 9, 9, 6, 26, 1112, 816, 3260, 13, 5, 103, 2377, 6, 17, 1112, 816, 2782, 13, 5, 103, 10641, 6, 29, 84, 2512, 2430, 782, 18684, 2761, 19, 808, 2430, 2556, 17, 855, 1480, 9477, 4091, 128, 11712, 15, 7103, 2153, 673, 17, 24883, 9990, 9, 3], [2, 11502, 25, 1006, 20, 782, 8, 11809, 855, 1732, 19393, 18667, 37, 367, 21018, 69, 1854, 34, 11860, 19124, 27, 156, 225, 17, 193, 4141, 19, 65, 9124, 9, 3, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E501 - # fmt: on - - self.tokenizer_integration_test_util( - expected_encoding=expected_encoding, - model_name="albert-base-v2", - revision="6b6560eaf5ff2e250b00c50f380c5389a9c2d82e", - ) \ No newline at end of file diff --git a/tests/transformers/models/align/__init__.py b/tests/transformers/models/align/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/align/test_modeling_align.py b/tests/transformers/models/align/test_modeling_align.py deleted file mode 100644 index 0ee377b6d..000000000 --- a/tests/transformers/models/align/test_modeling_align.py +++ /dev/null @@ -1,571 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ALIGN model. 
""" - - -import inspect -import tempfile -import unittest - -import requests - -import numpy as np -from mindnlp.transformers import AlignConfig, AlignProcessor, AlignTextConfig, AlignVisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - AlignModel, - AlignTextModel, - AlignVisionModel, - ) - from mindnlp.transformers.models.align.modeling_align import ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST - - - -if is_vision_available(): - from PIL import Image - - -class AlignVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=32, - num_channels=3, - kernel_sizes=[3, 3, 5], - in_channels=[32, 16, 24], - out_channels=[16, 24, 30], - hidden_dim=64, - strides=[1, 1, 2], - num_block_repeats=[1, 1, 2], - expand_ratios=[1, 6, 6], - is_training=True, - hidden_act="gelu", - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.kernel_sizes = kernel_sizes - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_dim = hidden_dim - self.strides = strides - self.num_block_repeats = num_block_repeats - self.expand_ratios = expand_ratios - self.is_training = is_training - self.hidden_act = hidden_act - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return AlignVisionConfig( - num_channels=self.num_channels, - kernel_sizes=self.kernel_sizes, - in_channels=self.in_channels, - out_channels=self.out_channels, - hidden_dim=self.hidden_dim, - strides=self.strides, - num_block_repeats=self.num_block_repeats, - expand_ratios=self.expand_ratios, - hidden_act=self.hidden_act, - ) - - def create_and_check_model(self, config, pixel_values): - model = AlignVisionModel(config=config) - - model.set_train(False) - result = model(pixel_values) - - patch_size = self.image_size // 4 - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, config.hidden_dim, patch_size, patch_size) - ) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, config.hidden_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class AlignVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ALIGN does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (AlignVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = AlignVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AlignVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="AlignVisionModel does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="AlignVisionModel does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="AlignVisionModel does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - num_blocks = sum(config.num_block_repeats) * 4 - self.assertEqual(len(hidden_states), num_blocks) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.image_size // 2, self.model_tester.image_size // 2], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def 
test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - for model_name in ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AlignVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class AlignTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask - - def get_config(self): - return AlignTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, token_type_ids, input_mask): - model = AlignTextModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class AlignTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AlignTextModel,) if 
is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = AlignTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=AlignTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="ALIGN does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Align does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="AlignTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="AlignTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - for model_name in ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AlignTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class AlignModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = AlignTextModelTester(parent, **text_kwargs) - self.vision_model_tester = AlignVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - test_config, input_ids, token_type_ids, input_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, pixel_values - - def get_config(self): - return AlignConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values): - model = AlignModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask, token_type_ids) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, token_type_ids, input_mask, pixel_values = config_and_inputs - inputs_dict = { - 
"input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class AlignModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AlignModel,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": AlignModel} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = AlignModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Align does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="AlignModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `temperature` parameter initilization is different for ALIGN - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `temperature` is initilized as per the original implementation - if name == "temperature": - self.assertAlmostEqual( - param.data.item(), - 1.0, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif name == "text_projection.weight": - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save AlignConfig and check if we can load AlignVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = AlignVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save AlignConfig and check if we can load AlignTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = AlignTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - for model_name in ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AlignModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = 
"http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class AlignModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "kakaobrain/align-base" - model = AlignModel.from_pretrained(model_name) - processor = AlignProcessor.from_pretrained(model_name) - - image = prepare_img() - texts = ["a photo of a cat", "a photo of a dog"] - inputs = processor(text=texts, images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - expected_logits = mindspore.tensor([[9.7093, 3.4679]]) - print(outputs.logits_per_image.asnumpy()) - self.assertTrue(np.allclose(outputs.logits_per_image.asnumpy(), expected_logits.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/altclip/__init__.py b/tests/transformers/models/altclip/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/altclip/test_modeling_altclip.py b/tests/transformers/models/altclip/test_modeling_altclip.py deleted file mode 100644 index d0b86db6d..000000000 --- a/tests/transformers/models/altclip/test_modeling_altclip.py +++ /dev/null @@ -1,535 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore AltCLIP model. 
""" - - -import inspect -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import AltCLIPConfig, AltCLIPProcessor, AltCLIPTextConfig, AltCLIPVisionConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import AltCLIPModel, AltCLIPTextModel, AltCLIPVisionModel - from mindnlp.transformers.models.altclip.modeling_altclip import ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST - -if is_vision_available(): - from PIL import Image - - -class AltCLIPVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return AltCLIPVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = AltCLIPVisionModel(config=config) - - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class 
AltCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (AltCLIPVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = AltCLIPVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AltCLIPVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="CLIPModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="AltCLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="AltCLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="AltCLIPVisionModel use the same cv backbone with CLIP model.") - def test_model_from_pretrained(self): - pass - - -class AltCLIPTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - project_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - 
self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.project_dim = project_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - # input_mask[batch_idx, :start_index] = 1 - # input_mask[batch_idx, start_index:] = 0 - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((int(start_index),), batch_idx, dtype=mindspore.int64), ops.arange(int(start_index))], dim=1), - ops.full((int(start_index),), 1, dtype=mindspore.int64)) - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((input_mask.shape[1] - int(start_index),), batch_idx, dtype=mindspore.int64), ops.arange(int(input_mask.shape[1] - start_index))], dim=1), - ops.full((input_mask.shape[1] - int(start_index),), 0, dtype=mindspore.int64)) - - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return AltCLIPTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - project_dim=self.project_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - pad_token_id=1, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = AltCLIPTextModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AltCLIPTextModel,) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - test_head_masking = False - - # TODO (@SunMarc): Fix me - @unittest.skip("It's broken.") - def test_resize_tokens_embeddings(self): - super().test_resize_tokens_embeddings() - - def setUp(self): - self.model_tester = AltCLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=AltCLIPTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - 
- def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_model_outputs_equivalence(self): - pass - - @unittest.skip(reason="Result of the model is a dict") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="AltCLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="AltCLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="AltCLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - for model_name in ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AltCLIPTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class AltCLIPModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = AltCLIPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = AltCLIPVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return AltCLIPConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = AltCLIPModel(config=config) - - model.set_train(False) - - model(input_ids, pixel_values, attention_mask) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_mindspore -class AltCLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AltCLIPModel,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": AltCLIPModel} if is_mindspore_available() else {} - fx_compatible = True - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - # TODO: Fix the failed tests when this model gets more usage - def 
is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "FeatureExtractionPipelineTests": - return True - - return False - - def setUp(self): - self.model_tester = AltCLIPModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="CLIPModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="CLIPModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for AltCLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @slow - def test_model_from_pretrained(self): - for model_name in ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = AltCLIPModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_vision -@require_mindspore -class AltCLIPModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "BAAI/AltCLIP" - model = AltCLIPModel.from_pretrained(model_name) - processor = AltCLIPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="ms") # fmt: skip - - # forward pass - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - probs = ops.softmax(outputs.logits_per_image, dim=1) - expected_probs = mindspore.tensor([[9.9942e-01, 5.7805e-04]]) - - self.assertTrue(np.allclose(probs.asnumpy(), expected_probs.asnumpy(), atol=5e-3)) diff --git a/tests/transformers/models/audio_spectrogram_transformer/__init__.py b/tests/transformers/models/audio_spectrogram_transformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/transformers/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py deleted file mode 100644 index 2a7b98eec..000000000 --- 
a/tests/transformers/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ /dev/null @@ -1,260 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindNLP Audio Spectrogram Transformer (AST) model. """ - -import inspect -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download -import soundfile - -from mindnlp.transformers import ASTConfig -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import cached_property, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - - from mindnlp.transformers import ASTForAudioClassification, ASTModel - from mindnlp.transformers.models.audio_spectrogram_transformer.modeling_audio_spectrogram_transformer import ( - AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, - ) - from mindnlp.transformers import ASTFeatureExtractor - -class ASTModelTester: - def __init__( - self, - parent, - batch_size=13, - patch_size=2, - max_length=24, - num_mel_bins=16, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - scope=None, - frequency_stride=2, - time_stride=2, - ): - self.parent = parent - self.batch_size = batch_size - self.patch_size = patch_size - self.max_length = max_length - self.num_mel_bins = num_mel_bins - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.frequency_stride = frequency_stride - self.time_stride = time_stride - - # in AST, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) - frequency_out_dimension = (self.num_mel_bins - self.patch_size) // self.frequency_stride + 1 - time_out_dimension = (self.max_length - self.patch_size) // self.time_stride + 1 - num_patches = frequency_out_dimension * time_out_dimension - self.seq_length = num_patches + 2 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.max_length, self.num_mel_bins]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], 
self.type_sequence_label_size) - - config = self.get_config() - - return config, input_values, labels - - def get_config(self): - return ASTConfig( - patch_size=self.patch_size, - max_length=self.max_length, - num_mel_bins=self.num_mel_bins, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - frequency_stride=self.frequency_stride, - time_stride=self.time_stride, - ) - - def create_and_check_model(self, config, input_values, labels): - model = ASTModel(config=config) - model.set_train(False) - result = model(input_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_values, - labels, - ) = config_and_inputs - inputs_dict = {"input_values": input_values} - return config, inputs_dict - - -@require_mindspore -class ASTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as AST does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - ASTModel, - ASTForAudioClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"audio-classification": ASTForAudioClassification, "feature-extraction": ASTModel} - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "AudioClassificationPipelineTests": - return True - - return False - - def setUp(self): - self.model_tester = ASTModelTester(self) - self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - @unittest.skip(reason="AST does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - 
expected_arg_names = ["input_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = ASTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on some audio from AudioSet -def prepare_audio(): - filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset", - endpoint='https://hf-mirror.com' - ) - audio, sampling_rate = soundfile.read(filepath) - - return audio, sampling_rate - - -@require_mindspore -class ASTModelIntegrationTest(unittest.TestCase): - @cached_property - def default_feature_extractor(self): - return ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593") - - def test_inference_audio_classification(self): - feature_extractor = self.default_feature_extractor - model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593") - - feature_extractor = self.default_feature_extractor - audio, sampling_rate = prepare_audio() - audio = audio.squeeze() - print(audio.shape) - inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 527) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-0.8760, -7.0042, -8.6602]) - print(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/auto/__init__.py b/tests/transformers/models/auto/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/auto/test_configuration_auto.py b/tests/transformers/models/auto/test_configuration_auto.py deleted file mode 100644 index 46982e2ae..000000000 --- a/tests/transformers/models/auto/test_configuration_auto.py +++ /dev/null @@ -1,89 +0,0 @@ -# coding=utf-8 -# Copyright 2019-present, the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import importlib -import json -import os -import sys -import tempfile -import unittest -from pathlib import Path - -import mindnlp.transformers.models.auto -from mindnlp.transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig -from mindnlp.transformers.models.bert.configuration_bert import BertConfig -from mindnlp.transformers.models.roberta.configuration_roberta import RobertaConfig -from mindnlp.utils.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, get_tests_dir - - -sys.path.append(str(Path(__file__).parent.parent.parent.parent)) - -from test_module.custom_configuration import CustomConfig # noqa E402 - - -SAMPLE_ROBERTA_CONFIG = get_tests_dir("fixtures/dummy-config.json") - - -class AutoConfigTest(unittest.TestCase): - def test_module_spec(self): - self.assertIsNotNone(mindnlp.transformers.models.auto.__spec__) - self.assertIsNotNone(importlib.util.find_spec("mindnlp.transformers.models.auto")) - - def test_config_from_model_shortcut(self): - config = AutoConfig.from_pretrained("bert-base-uncased") - self.assertIsInstance(config, BertConfig) - - def test_config_model_type_from_local_file(self): - config = AutoConfig.from_pretrained(SAMPLE_ROBERTA_CONFIG) - self.assertIsInstance(config, RobertaConfig) - - def test_config_model_type_from_model_identifier(self): - config = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER) - self.assertIsInstance(config, RobertaConfig) - - def test_config_for_model_str(self): - config = AutoConfig.for_model("roberta") - self.assertIsInstance(config, RobertaConfig) - - def test_pattern_matching_fallback(self): - with tempfile.TemporaryDirectory() as tmp_dir: - # This model name contains bert and roberta, but roberta ends up being picked. - folder = os.path.join(tmp_dir, "fake-roberta") - os.makedirs(folder, exist_ok=True) - with open(os.path.join(folder, "config.json"), "w") as f: - f.write(json.dumps({})) - config = AutoConfig.from_pretrained(folder) - self.assertEqual(type(config), RobertaConfig) - - def test_new_config_registration(self): - try: - AutoConfig.register("custom", CustomConfig) - # Wrong model type will raise an error - with self.assertRaises(ValueError): - AutoConfig.register("model", CustomConfig) - # Trying to register something existing in the Transformers library will raise an error - with self.assertRaises(ValueError): - AutoConfig.register("bert", BertConfig) - - # Now that the config is registered, it can be used as any other config with the auto-API - config = CustomConfig() - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained(tmp_dir) - new_config = AutoConfig.from_pretrained(tmp_dir) - self.assertIsInstance(new_config, CustomConfig) - - finally: - if "custom" in CONFIG_MAPPING._extra_content: - del CONFIG_MAPPING._extra_content["custom"] diff --git a/tests/transformers/models/auto/test_modeling_auto.py b/tests/transformers/models/auto/test_modeling_auto.py deleted file mode 100644 index 55cb708fa..000000000 --- a/tests/transformers/models/auto/test_modeling_auto.py +++ /dev/null @@ -1,367 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import sys -import tempfile -import unittest -from collections import OrderedDict -from pathlib import Path - -import pytest - -import mindnlp -from mindnlp.transformers import BertConfig, GPT2Model -from mindnlp.transformers.models.auto.configuration_auto import CONFIG_MAPPING -from mindnlp.utils.testing_utils import ( - DUMMY_UNKNOWN_IDENTIFIER, - SMALL_MODEL_IDENTIFIER, - RequestCounter, - require_mindspore, - slow, -) -from mindnlp.utils import is_safetensors_available, is_mindspore_available -from ..bert.test_modeling_bert import BertModelTester - - -from ....test_module.custom_configuration import CustomConfig # noqa E402 - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from ....test_module.custom_modeling import CustomModel - - from mindnlp.transformers import ( - AutoBackbone, - AutoConfig, - AutoModel, - AutoModelForCausalLM, - AutoModelForMaskedLM, - AutoModelForPreTraining, - AutoModelForQuestionAnswering, - AutoModelForSeq2SeqLM, - AutoModelForSequenceClassification, - AutoModelForTableQuestionAnswering, - AutoModelForTokenClassification, - AutoModelWithLMHead, - BertForMaskedLM, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - BertModel, - FunnelBaseModel, - FunnelModel, - GPT2Config, - GPT2LMHeadModel, - ResNetBackbone, - RobertaForMaskedLM, - T5Config, - T5ForConditionalGeneration, - TapasConfig, - TapasForQuestionAnswering, - ) - from mindnlp.transformers.models.auto.modeling_auto import ( - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODEL_FOR_PRETRAINING_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - MODEL_MAPPING, - ) - - -@require_mindspore -class AutoModelTest(unittest.TestCase): - - @slow - def test_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = AutoModel.from_pretrained(model_name) - model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertModel) - - self.assertEqual(len(loading_info["missing_keys"]), 0) - # When using PyTorch checkpoint, the expected value is `8`. With `safetensors` checkpoint (if it is - # installed), the expected value becomes `7`. 
- EXPECTED_NUM_OF_UNEXPECTED_KEYS = 7 if is_safetensors_available() else 8 - self.assertEqual(len(loading_info["unexpected_keys"]), EXPECTED_NUM_OF_UNEXPECTED_KEYS) - self.assertEqual(len(loading_info["mismatched_keys"]), 0) - self.assertEqual(len(loading_info["error_msgs"]), 0) - - @slow - def test_model_for_pretraining_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = AutoModelForPreTraining.from_pretrained(model_name) - model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertForPreTraining) - # Only one value should not be initialized and in the missing keys. - for key, value in loading_info.items(): - self.assertEqual(len(value), 0) - - @slow - def test_lmhead_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = AutoModelWithLMHead.from_pretrained(model_name) - model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertForMaskedLM) - - @slow - def test_model_for_causal_lm(self): - model_name = "openai-community/gpt2" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, GPT2Config) - - model = AutoModelForCausalLM.from_pretrained(model_name) - model, loading_info = AutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, GPT2LMHeadModel) - - @slow - def test_model_for_masked_lm(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = AutoModelForMaskedLM.from_pretrained(model_name) - model, loading_info = AutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertForMaskedLM) - - @slow - def test_model_for_encoder_decoder_lm(self): - model_name = "google-t5/t5-base" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, T5Config) - - model = AutoModelForSeq2SeqLM.from_pretrained(model_name) - model, loading_info = AutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, T5ForConditionalGeneration) - - @slow - def test_sequence_classification_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = AutoModelForSequenceClassification.from_pretrained(model_name) - model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertForSequenceClassification) - - @slow - def test_question_answering_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = 
AutoModelForQuestionAnswering.from_pretrained(model_name) - model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertForQuestionAnswering) - - @slow - def test_table_question_answering_model_from_pretrained(self): - model_name = "google/tapas-base" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, TapasConfig) - - model = AutoModelForTableQuestionAnswering.from_pretrained(model_name) - model, loading_info = AutoModelForTableQuestionAnswering.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, TapasForQuestionAnswering) - - @slow - def test_token_classification_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - config = AutoConfig.from_pretrained(model_name) - self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) - - model = AutoModelForTokenClassification.from_pretrained(model_name) - model, loading_info = AutoModelForTokenClassification.from_pretrained(model_name, output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, BertForTokenClassification) - - @slow - def test_auto_backbone_from_pretrained(self): - model = AutoBackbone.from_pretrained("microsoft/resnet-18") - model, loading_info = AutoBackbone.from_pretrained("microsoft/resnet-18", output_loading_info=True) - self.assertIsNotNone(model) - self.assertIsInstance(model, ResNetBackbone) - - # Check kwargs are correctly passed to the backbone - model = AutoBackbone.from_pretrained("microsoft/resnet-18", out_indices=[-2, -1]) - self.assertEqual(model.out_indices, [-2, -1]) - self.assertEqual(model.out_features, ["stage3", "stage4"]) - - model = AutoBackbone.from_pretrained("microsoft/resnet-18", out_features=["stage2", "stage4"]) - self.assertEqual(model.out_indices, [2, 4]) - self.assertEqual(model.out_features, ["stage2", "stage4"]) - - def test_from_pretrained_identifier(self): - model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) - self.assertIsInstance(model, BertForMaskedLM) - self.assertEqual(model.num_parameters(), 14410) - self.assertEqual(model.num_parameters(only_trainable=True), 14410) - - def test_from_identifier_from_model_type(self): - model = AutoModelWithLMHead.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER) - self.assertIsInstance(model, RobertaForMaskedLM) - self.assertEqual(model.num_parameters(), 14410) - self.assertEqual(model.num_parameters(only_trainable=True), 14410) - - def test_from_pretrained_with_tuple_values(self): - # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel - model = AutoModel.from_pretrained("sgugger/funnel-random-tiny") - self.assertIsInstance(model, FunnelModel) - - config = copy.deepcopy(model.config) - config.architectures = ["FunnelBaseModel"] - model = AutoModel.from_config(config) - self.assertIsInstance(model, FunnelBaseModel) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir) - model = AutoModel.from_pretrained(tmp_dir) - self.assertIsInstance(model, FunnelBaseModel) - - def test_new_model_registration(self): - AutoConfig.register("custom", CustomConfig) - - auto_classes = [ - AutoModel, - AutoModelForCausalLM, - AutoModelForMaskedLM, - AutoModelForPreTraining, - AutoModelForQuestionAnswering, - AutoModelForSequenceClassification, - AutoModelForTokenClassification, - ] - - try: - for auto_class 
in auto_classes: - with self.subTest(auto_class.__name__): - # Wrong config class will raise an error - with self.assertRaises(ValueError): - auto_class.register(BertConfig, CustomModel) - auto_class.register(CustomConfig, CustomModel) - # Trying to register something existing in the Transformers library will raise an error - with self.assertRaises(ValueError): - auto_class.register(BertConfig, BertModel) - - # Now that the config is registered, it can be used as any other config with the auto-API - tiny_config = BertModelTester(self).get_config() - config = CustomConfig(**tiny_config.to_dict()) - model = auto_class.from_config(config) - self.assertIsInstance(model, CustomModel) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir) - new_model = auto_class.from_pretrained(tmp_dir) - # The model is a CustomModel but from the new dynamically imported class. - self.assertIsInstance(new_model, CustomModel) - - finally: - if "custom" in CONFIG_MAPPING._extra_content: - del CONFIG_MAPPING._extra_content["custom"] - for mapping in ( - MODEL_MAPPING, - MODEL_FOR_PRETRAINING_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - ): - if CustomConfig in mapping._extra_content: - del mapping._extra_content[CustomConfig] - - # def test_repo_not_found(self): - # with self.assertRaisesRegex( - # EnvironmentError, "bert-base is not a local folder and is not a valid model identifier" - # ): - # _ = AutoModel.from_pretrained("bert-base") - - # def test_revision_not_found(self): - # with self.assertRaisesRegex( - # EnvironmentError, r"aaaaaa is not a valid git identifier \(branch name, tag name or commit id\)" - # ): - # _ = AutoModel.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa") - - # def test_model_file_not_found(self): - # with self.assertRaisesRegex( - # EnvironmentError, - # "hf-internal-testing/config-no-model does not appear to have a file named mindspore_model.ckpt, model.safetensors.", - # ): - # _ = AutoModel.from_pretrained("hf-internal-testing/config-no-model") - - # def test_cached_model_has_minimum_calls_to_head(self): - # # Make sure we have cached the model. 
- # _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert") - # with RequestCounter() as counter: - # _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert") - # self.assertEqual(counter["GET"], 0) - # self.assertEqual(counter["HEAD"], 1) - # self.assertEqual(counter.total_calls, 1) - - # # With a sharded checkpoint - # _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded") - # with RequestCounter() as counter: - # _ = AutoModel.from_pretrained("hf-internal-testing/tiny-random-bert-sharded") - # self.assertEqual(counter["GET"], 0) - # self.assertEqual(counter["HEAD"], 1) - # self.assertEqual(counter.total_calls, 1) - - def test_attr_not_existing(self): - from mindnlp.transformers.models.auto.auto_factory import _LazyAutoMapping - - _CONFIG_MAPPING_NAMES = OrderedDict([("bert", "BertConfig")]) - _MODEL_MAPPING_NAMES = OrderedDict([("bert", "GhostModel")]) - _MODEL_MAPPING = _LazyAutoMapping(_CONFIG_MAPPING_NAMES, _MODEL_MAPPING_NAMES) - - with pytest.raises(ValueError, match=r"Could not find GhostModel neither in .* nor in .*!"): - _MODEL_MAPPING[BertConfig] - - _MODEL_MAPPING_NAMES = OrderedDict([("bert", "BertModel")]) - _MODEL_MAPPING = _LazyAutoMapping(_CONFIG_MAPPING_NAMES, _MODEL_MAPPING_NAMES) - self.assertEqual(_MODEL_MAPPING[BertConfig], BertModel) - - _MODEL_MAPPING_NAMES = OrderedDict([("bert", "GPT2Model")]) - _MODEL_MAPPING = _LazyAutoMapping(_CONFIG_MAPPING_NAMES, _MODEL_MAPPING_NAMES) - self.assertEqual(_MODEL_MAPPING[BertConfig], GPT2Model) \ No newline at end of file diff --git a/tests/transformers/models/autoformer/__init__.py b/tests/transformers/models/autoformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/autoformer/test_modeling_autoformer.py b/tests/transformers/models/autoformer/test_modeling_autoformer.py deleted file mode 100644 index 51c3b6d4a..000000000 --- a/tests/transformers/models/autoformer/test_modeling_autoformer.py +++ /dev/null @@ -1,489 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the Mindspore Autoformer model. 
""" - -import inspect -import tempfile -import unittest - -from huggingface_hub import hf_hub_download - -from mindnlp.utils.testing_utils import is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - -TOLERANCE = 1e-4 - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import AutoformerConfig, AutoformerForPrediction, AutoformerModel - - from mindnlp.transformers.models.autoformer.modeling_autoformer import AutoformerDecoder, AutoformerEncoder - - -class AutoformerModelTester: - def __init__( - self, - parent, - d_model=16, - batch_size=13, - prediction_length=7, - context_length=14, - label_length=10, - cardinality=19, - embedding_dimension=5, - num_time_features=4, - is_training=True, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - lags_sequence=[1, 2, 3, 4, 5], - moving_average=25, - autocorrelation_factor=5, - ): - self.d_model = d_model - self.parent = parent - self.batch_size = batch_size - self.prediction_length = prediction_length - self.context_length = context_length - self.cardinality = cardinality - self.num_time_features = num_time_features - self.lags_sequence = lags_sequence - self.embedding_dimension = embedding_dimension - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - - self.encoder_seq_length = context_length - self.decoder_seq_length = prediction_length + label_length - self.label_length = label_length - - self.moving_average = moving_average - self.autocorrelation_factor = autocorrelation_factor - - def get_config(self): - return AutoformerConfig( - d_model=self.d_model, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - prediction_length=self.prediction_length, - context_length=self.context_length, - label_length=self.label_length, - lags_sequence=self.lags_sequence, - num_time_features=self.num_time_features, - num_static_categorical_features=1, - cardinality=[self.cardinality], - embedding_dimension=[self.embedding_dimension], - moving_average=self.moving_average, - ) - - def prepare_autoformer_inputs_dict(self, config): - _past_length = config.context_length + max(config.lags_sequence) - - static_categorical_features = ids_tensor( - [self.batch_size, 1], config.cardinality[0]) - past_time_features = floats_tensor( - [self.batch_size, _past_length, config.num_time_features]) - past_values = floats_tensor([self.batch_size, _past_length]) - past_observed_mask = floats_tensor( - [self.batch_size, _past_length]) > 0.5 - - # decoder inputs - future_time_features = floats_tensor( - [self.batch_size, config.prediction_length, config.num_time_features]) - future_values = floats_tensor( - [self.batch_size, config.prediction_length]) - - inputs_dict = { - "past_values": past_values, 
- "static_categorical_features": static_categorical_features, - "past_time_features": past_time_features, - "past_observed_mask": past_observed_mask, - "future_time_features": future_time_features, - "future_values": future_values, - } - return inputs_dict - - def prepare_config_and_inputs(self): - config = self.get_config() - inputs_dict = self.prepare_autoformer_inputs_dict(config) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = AutoformerModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = AutoformerEncoder.from_pretrained( - tmpdirname) - - transformer_inputs, feature, _, _, _ = model.create_network_inputs( - **inputs_dict) - seasonal_input, trend_input = model.decomposition_layer( - transformer_inputs[:, : config.context_length, ...]) - - enc_input = ops.cat( - (transformer_inputs[:, : config.context_length, ...], - feature[:, : config.context_length, ...]), - dim=-1, - ) - encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] - self.parent.assertTrue( - (encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - mean = ( - ops.mean( - transformer_inputs[:, : config.context_length, ...], dim=1) - .unsqueeze(1) - .tile((1, config.prediction_length, 1)) - ) - zeros = ops.zeros(transformer_inputs.shape[0], config.prediction_length, - transformer_inputs.shape[2]) - - dec_input = ops.cat( - ( - ops.cat( - (seasonal_input[:, -config.label_length:, ...], zeros), dim=1), - feature[:, config.context_length - config.label_length:, ...], - ), - dim=-1, - ) - trend_init = ops.cat( - ( - ops.cat( - (trend_input[:, -config.label_length:, ...], mean), dim=1), - feature[:, config.context_length - config.label_length:, ...], - ), - dim=-1, - ) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = AutoformerDecoder.from_pretrained( - tmpdirname) - - last_hidden_state_2 = decoder( - trend=trend_init, - inputs_embeds=dec_input, - encoder_hidden_states=encoder_last_hidden_state, - )[0] - - self.parent.assertTrue( - (last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - - -class AutoformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - AutoformerModel, AutoformerForPrediction) if is_mindspore_available() else () - all_generative_model_classes = ( - AutoformerForPrediction,) if is_mindspore_available() else () - pipeline_model_mapping = { - "feature-extraction": AutoformerModel} if is_mindspore_available() else {} - test_pruning = False - test_head_masking = False - test_missing_keys = False - test_torchscript = False - test_inputs_embeds = False - test_model_get_set_embeddings = False - - def setUp(self): - self.model_tester = AutoformerModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AutoformerConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = 
model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained( - tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone( - *config_and_inputs) - - @unittest.skip(reason="Model does not have input embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Model has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip('complex do not support gradient.') - def test_training(self): - pass - - # # Input is 'static_categorical_features' not 'input_ids' - def test_model_main_input_name(self): - model_signature = inspect.signature( - getattr(AutoformerModel, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(AutoformerModel.main_input_name, - observed_main_input_name) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "past_values", - "past_time_features", - "past_observed_mask", - "static_categorical_features", - "static_real_features", - "future_values", - "future_time_features", - ] - - if model.__class__.__name__ in ["AutoformerForPrediction"]: - expected_arg_names.append("future_observed_mask") - - expected_arg_names.extend( - [ - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", - "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", - ] - ) - - self.assertListEqual( - arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr( - self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr( - self.model_tester, "encoder_seq_length", seq_len) - d_model = getattr(self.model_tester, "d_model", None) - num_attention_heads = getattr( - self.model_tester, "num_attention_heads", None) - dim = d_model // num_attention_heads - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual( - len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual( - len(attentions), 
self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, dim], - ) - out_len = len(outputs) - - correct_outlen = 7 - - if "last_hidden_state" in outputs: - correct_outlen += 1 - - if "trend" in outputs: - correct_outlen += 1 - - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - if "loss" in outputs: - correct_outlen += 1 - - if "params" in outputs: - correct_outlen += 1 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), - self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, dim], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), - self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, dim], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 2, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), - self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, dim], - ) - - -def prepare_batch(filename="train-batch.pt"): - file = hf_hub_download( - repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") - batch = mindspore.load(file) - return batch - - - -class AutoformerModelIntegrationTests(unittest.TestCase): - @unittest.skip('Mindspore cannot load torch .pt file.') - def test_inference_no_head(self): - model = AutoformerModel.from_pretrained( - "huggingface/autoformer-tourism-monthly") - batch = prepare_batch() - - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - future_values=batch["future_values"], - future_time_features=batch["future_time_features"], - )[0] - - expected_shape = (64, model.config.prediction_length + - model.config.label_length, model.config.feature_size) - - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.3593, -1.3398, 0.6330], [0.2279, 1.5396, -0.1792], [0.0450, 1.3225, -0.2335]] - ) - self.assertTrue(mindspore.allclose( - output[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=TOLERANCE)) - - @unittest.skip('Mindspore cannot load torch .pt file.') - def test_inference_head(self): - model = AutoformerForPrediction.from_pretrained( - "huggingface/autoformer-tourism-monthly") - batch = prepare_batch("val-batch.pt") - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - 
static_categorical_features=batch["static_categorical_features"], - ).encoder_last_hidden_state - expected_shape = (64, model.config.context_length, model.config.d_model) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.0734, -0.9036, 0.8358], [4.7186, 2.4113, 1.9581], [1.7953, 2.3558, 1.2970]] - ) - self.assertTrue(mindspore.allclose( - output[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=TOLERANCE)) - - @unittest.skip('Mindspore cannot load torch .pt file.') - def test_seq_to_seq_generation(self): - model = AutoformerForPrediction.from_pretrained( - "huggingface/autoformer-tourism-monthly") - batch = prepare_batch("val-batch.pt") - outputs = model.generate( - static_categorical_features=batch["static_categorical_features"], - past_time_features=batch["past_time_features"], - past_values=batch["past_values"], - future_time_features=batch["future_time_features"], - past_observed_mask=batch["past_observed_mask"], - ) - expected_shape = (64, model.config.num_parallel_samples, model.config.prediction_length) - self.assertEqual(outputs.sequences.shape, expected_shape) - - expected_slice = mindspore.tensor( - [3130.6763, 4056.5293, 7053.0786]) - mean_prediction = outputs.sequences.mean(axis=1) - self.assertTrue(mindspore.allclose( - mean_prediction[0, -3:].asnumpy(), expected_slice.asnumpy(), rtol=1e-1)) diff --git a/tests/transformers/models/baichuan/__init__.py b/tests/transformers/models/baichuan/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bark/__init__.py b/tests/transformers/models/bark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bark/test_modeling_bark.py b/tests/transformers/models/bark/test_modeling_bark.py deleted file mode 100644 index ca0568b22..000000000 --- a/tests/transformers/models/bark/test_modeling_bark.py +++ /dev/null @@ -1,1169 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Bark model.""" - -import copy -import inspect -import tempfile -import unittest - - -from mindnlp.transformers import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, -) -from mindnlp.transformers.models.bark.generation_configuration_bark import ( - BarkCoarseGenerationConfig, - BarkFineGenerationConfig, - BarkSemanticGenerationConfig, -) -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - require_mindspore_gpu, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -from ..encodec.test_modeling_encodec import EncodecModelTester - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad - from mindnlp.engine import set_seed - - from mindnlp.transformers import ( - BarkCausalModel, - BarkCoarseModel, - BarkFineModel, - BarkModel, - BarkProcessor, - BarkSemanticModel, - ) - -class BarkSemanticModelTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden_layers - seq_length=4, - is_training=False, # for now training is not supported - use_input_mask=True, - use_labels=True, - vocab_size=33, - output_vocab_size=33, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=15, - dropout=0.1, - window_size=256, - initializer_range=0.02, - n_codes_total=8, # for BarkFineModel - n_codes_given=1, # for BarkFineModel - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.output_vocab_size = output_vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.window_size = window_size - self.initializer_range = initializer_range - self.bos_token_id = output_vocab_size - 1 - self.eos_token_id = output_vocab_size - 1 - self.pad_token_id = output_vocab_size - 1 - - self.n_codes_total = n_codes_total - self.n_codes_given = n_codes_given - - self.is_encoder_decoder = False - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - inputs_dict = { - "input_ids": input_ids, - "head_mask": head_mask, - "attention_mask": input_mask, - } - - return config, inputs_dict - - def get_config(self): - return BarkSemanticConfig( - vocab_size=self.vocab_size, - output_vocab_size=self.output_vocab_size, - hidden_size=self.hidden_size, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - window_size=self.window_size, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - config.output_vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() 
- return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BarkSemanticModel(config=config).eval() - - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["logits"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "logits" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # test no attention_mask works - outputs = model(input_ids, use_cache=True) - _, past_key_values = outputs.to_tuple() - output_from_no_past = model(next_input_ids)["logits"] - - output_from_past = model(next_tokens, past_key_values=past_key_values)["logits"] - - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - -class BarkCoarseModelTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden_layers - seq_length=4, - is_training=False, # for now training is not supported - use_input_mask=True, - use_labels=True, - vocab_size=33, - output_vocab_size=33, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=15, - dropout=0.1, - window_size=256, - initializer_range=0.02, - n_codes_total=8, # for BarkFineModel - n_codes_given=1, # for BarkFineModel - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.output_vocab_size = output_vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.window_size = window_size - self.initializer_range = initializer_range - self.bos_token_id = output_vocab_size - 1 - self.eos_token_id = output_vocab_size - 1 - self.pad_token_id = output_vocab_size - 1 - - self.n_codes_total = n_codes_total - self.n_codes_given = n_codes_given - - self.is_encoder_decoder = False - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - 
input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - inputs_dict = { - "input_ids": input_ids, - "head_mask": head_mask, - "attention_mask": input_mask, - } - - return config, inputs_dict - - def get_config(self): - return BarkCoarseConfig( - vocab_size=self.vocab_size, - output_vocab_size=self.output_vocab_size, - hidden_size=self.hidden_size, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - window_size=self.window_size, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - config.output_vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BarkCoarseModel(config=config).eval() - - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["logits"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "logits" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # test no attention_mask works - outputs = model(input_ids, use_cache=True) - _, past_key_values = outputs.to_tuple() - output_from_no_past = model(next_input_ids)["logits"] - - output_from_past = model(next_tokens, past_key_values=past_key_values)["logits"] - - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - -class BarkFineModelTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden_layers - seq_length=4, - is_training=False, # for now training is not supported - use_input_mask=True, - use_labels=True, - vocab_size=33, - output_vocab_size=33, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=15, - dropout=0.1, - window_size=256, - initializer_range=0.02, - n_codes_total=8, # for BarkFineModel - n_codes_given=1, # for BarkFineModel - 
): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.output_vocab_size = output_vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.window_size = window_size - self.initializer_range = initializer_range - self.bos_token_id = output_vocab_size - 1 - self.eos_token_id = output_vocab_size - 1 - self.pad_token_id = output_vocab_size - 1 - - self.n_codes_total = n_codes_total - self.n_codes_given = n_codes_given - - self.is_encoder_decoder = False - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length, self.n_codes_total], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - # randint between self.n_codes_given - 1 and self.n_codes_total - 1 - codebook_idx = ids_tensor((1,), self.n_codes_total - self.n_codes_given).item() + self.n_codes_given - - inputs_dict = { - "codebook_idx": codebook_idx, - "input_ids": input_ids, - "head_mask": head_mask, - "attention_mask": input_mask, - } - - return config, inputs_dict - - def get_config(self): - return BarkFineConfig( - vocab_size=self.vocab_size, - output_vocab_size=self.output_vocab_size, - hidden_size=self.hidden_size, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - window_size=self.window_size, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - config.output_vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BarkFineModel(config=config).eval() - - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["logits"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "logits" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - 
self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # test no attention_mask works - outputs = model(input_ids, use_cache=True) - _, past_key_values = outputs.to_tuple() - output_from_no_past = model(next_input_ids)["logits"] - - output_from_past = model(next_tokens, past_key_values=past_key_values)["logits"] - - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - -class BarkModelTester: - def __init__( - self, - parent, - semantic_kwargs=None, - coarse_acoustics_kwargs=None, - fine_acoustics_kwargs=None, - codec_kwargs=None, - is_training=False, # for now training is not supported - ): - if semantic_kwargs is None: - semantic_kwargs = {} - if coarse_acoustics_kwargs is None: - coarse_acoustics_kwargs = {} - if fine_acoustics_kwargs is None: - fine_acoustics_kwargs = {} - if codec_kwargs is None: - codec_kwargs = {} - - self.parent = parent - self.semantic_model_tester = BarkSemanticModelTester(parent, **semantic_kwargs) - self.coarse_acoustics_model_tester = BarkCoarseModelTester(parent, **coarse_acoustics_kwargs) - self.fine_acoustics_model_tester = BarkFineModelTester(parent, **fine_acoustics_kwargs) - self.codec_model_tester = EncodecModelTester(parent, **codec_kwargs) - - self.is_training = is_training - - def get_config(self): - return BarkConfig.from_sub_model_configs( - self.semantic_model_tester.get_config(), - self.coarse_acoustics_model_tester.get_config(), - self.fine_acoustics_model_tester.get_config(), - self.codec_model_tester.get_config(), - ) - - def get_pipeline_config(self): - config = self.get_config() - - # follow the `get_pipeline_config` of the sub component models - config.semantic_config.vocab_size = 300 - config.coarse_acoustics_config.vocab_size = 300 - config.fine_acoustics_config.vocab_size = 300 - - config.semantic_config.output_vocab_size = 300 - config.coarse_acoustics_config.output_vocab_size = 300 - config.fine_acoustics_config.output_vocab_size = 300 - - return config - - -@require_mindspore -class BarkSemanticModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BarkSemanticModel,) if is_mindspore_available() else () - all_generative_model_classes = (BarkCausalModel,) if is_mindspore_available() else () - - is_encoder_decoder = False - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_model_parallel = False - # no model_parallel for now - - test_resize_embeddings = True - - def setUp(self): - self.model_tester = BarkSemanticModelTester(self) - self.config_tester = ConfigTester(self, config_class=BarkSemanticConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - wte = model.get_input_embeddings() - inputs["input_embeds"] = wte(input_ids) - - with no_grad(): - model(**inputs)[0] - - # override as the input arg is called "input_embeds", not "inputs_embeds" - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - with no_grad(): - out_ids = model(**inputs)[0] - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - wte = model.get_input_embeddings() - inputs["input_embeds"] = wte(input_ids) - with no_grad(): - out_embeds = model(**inputs)[0] - - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = self.all_generative_model_classes[0](config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - -@require_mindspore -class BarkCoarseModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - # Same tester as BarkSemanticModelTest, except for model_class and config_class - all_model_classes = (BarkCoarseModel,) if is_mindspore_available() else () - all_generative_model_classes = (BarkCausalModel,) if is_mindspore_available() else () - - is_encoder_decoder = False - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_model_parallel = False - # no model_parallel for now - - test_resize_embeddings = True - - def setUp(self): - self.model_tester = BarkCoarseModelTester(self) - self.config_tester = ConfigTester(self, config_class=BarkCoarseConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - wte = model.get_input_embeddings() - inputs["input_embeds"] = wte(input_ids) - - with no_grad(): - model(**inputs)[0] - - # override as the input arg 
is called "input_embeds", not "inputs_embeds" - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - with no_grad(): - out_ids = model(**inputs)[0] - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - wte = model.get_input_embeddings() - inputs["input_embeds"] = wte(input_ids) - with no_grad(): - out_embeds = model(**inputs)[0] - - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = self.all_generative_model_classes[0](config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - -@require_mindspore -class BarkFineModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BarkFineModel,) if is_mindspore_available() else () - - is_encoder_decoder = False - fx_compatible = False - test_missing_keys = False - test_pruning = False - # no model_parallel for now - test_model_parallel = False - - # torchscript disabled for now because forward with an int - test_torchscript = False - - test_resize_embeddings = True - - def setUp(self): - self.model_tester = BarkFineModelTester(self) - self.config_tester = ConfigTester(self, config_class=BarkFineConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - wte = model.get_input_embeddings()[inputs_dict["codebook_idx"]] - - inputs["input_embeds"] = wte(input_ids[:, :, inputs_dict["codebook_idx"]]) - - with no_grad(): - model(**inputs)[0] - - @unittest.skip(reason="FineModel relies on codebook idx and does not return same logits") - def test_inputs_embeds_matches_input_ids(self): - pass - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - # take first codebook channel - - model = self.all_model_classes[0](config).eval() - model.half() - - # toy generation_configs - semantic_generation_config = BarkSemanticGenerationConfig(semantic_vocab_size=0) - coarse_generation_config = BarkCoarseGenerationConfig(n_coarse_codebooks=config.n_codes_given) - fine_generation_config = BarkFineGenerationConfig( - max_fine_history_length=config.block_size // 2, - max_fine_input_length=config.block_size, - n_fine_codebooks=config.n_codes_total, - ) - codebook_size = config.vocab_size - 1 - - model.generate( - 
input_ids, - history_prompt=None, - temperature=None, - semantic_generation_config=semantic_generation_config, - coarse_generation_config=coarse_generation_config, - fine_generation_config=fine_generation_config, - codebook_size=codebook_size, - ) - - model.generate( - input_ids, - history_prompt=None, - temperature=0.7, - semantic_generation_config=semantic_generation_config, - coarse_generation_config=coarse_generation_config, - fine_generation_config=fine_generation_config, - codebook_size=codebook_size, - ) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["codebook_idx", "input_ids"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model_get_set_embeddings(self): - # one embedding layer per codebook - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings()[0], (nn.Embedding)) - model.set_input_embeddings( - nn.ModuleList([nn.Embedding(10, 10) for _ in range(config.n_codes_total)]) - ) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x[0], nn.Linear)) - - def test_resize_tokens_embeddings(self): - # resizing tokens_embeddings of a ModuleList - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is False") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - model_embed_list = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings_list = [model_embed.weight.clone() for model_embed in model_embed_list] - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed_list = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - - # Check that it actually resizes the embeddings matrix for each codebook - for model_embed, cloned_embeddings in zip(model_embed_list, cloned_embeddings_list): - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed_list = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - for model_embed, cloned_embeddings in zip(model_embed_list, cloned_embeddings_list): - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - - 
model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - # only check for the first embedding matrix - models_equal = True - for p1, p2 in zip(cloned_embeddings_list[0], model_embed_list[0].weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_resize_embeddings_untied(self): - # resizing tokens_embeddings of a ModuleList - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is False") - - original_config.tie_word_embeddings = False - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - output_embeds_list = model.get_output_embeddings() - - for output_embeds in output_embeds_list: - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds_list = model.get_output_embeddings() - - for output_embeds in output_embeds_list: - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - -@require_mindspore -class BarkModelIntegrationTests(unittest.TestCase): - @cached_property - def model(self): - return BarkModel.from_pretrained("suno/bark") - - @cached_property - def processor(self): - return BarkProcessor.from_pretrained("suno/bark") - - @cached_property - def inputs(self): - input_ids = self.processor("In the light of the moon, a little egg lay on a leaf", voice_preset="en_speaker_6") - - return input_ids - - @cached_property - def semantic_generation_config(self): - semantic_generation_config = BarkSemanticGenerationConfig(**self.model.generation_config.semantic_config) - return semantic_generation_config - - @cached_property - def coarse_generation_config(self): - coarse_generation_config = BarkCoarseGenerationConfig(**self.model.generation_config.coarse_acoustics_config) - return coarse_generation_config - - @cached_property 
- def fine_generation_config(self): - fine_generation_config = BarkFineGenerationConfig(**self.model.generation_config.fine_acoustics_config) - return fine_generation_config - - @slow - def test_generate_semantic(self): - input_ids = self.inputs - - # check first ids - expected_output_ids = [7363, 321, 41, 1461, 6915, 952, 326, 41, 41, 927,] # fmt: skip - - # greedy decoding - with no_grad(): - output_ids = self.model.semantic.generate( - **input_ids, - do_sample=False, - temperature=1.0, - semantic_generation_config=self.semantic_generation_config, - ) - self.assertListEqual(output_ids[0, : len(expected_output_ids)].tolist(), expected_output_ids) - - @slow - def test_generate_semantic_early_stop(self): - input_ids = self.inputs - min_eos_p = 0.01 - - # check first ids - expected_output_ids = [7363, 321, 41, 1461, 6915, 952, 326, 41, 41, 927,] # fmt: skip - - # Should be able to read min_eos_p from kwargs - with no_grad(): - set_seed(0) - output_ids_without_min_eos_p = self.model.semantic.generate( - **input_ids, - do_sample=False, - temperature=0.9, - semantic_generation_config=self.semantic_generation_config, - ) - set_seed(0) - output_ids_kwargs = self.model.semantic.generate( - **input_ids, - do_sample=False, - temperature=0.9, - semantic_generation_config=self.semantic_generation_config, - min_eos_p=min_eos_p, - ) - self.assertListEqual(output_ids_without_min_eos_p[0, : len(expected_output_ids)].tolist(), expected_output_ids) - self.assertLess(len(output_ids_kwargs[0, :].tolist()), len(output_ids_without_min_eos_p[0, :].tolist())) - - # Should be able to read min_eos_p from the semantic generation config - self.semantic_generation_config.min_eos_p = min_eos_p - with no_grad(): - set_seed(0) - output_ids = self.model.semantic.generate( - **input_ids, - do_sample=False, - temperature=0.9, - semantic_generation_config=self.semantic_generation_config, - ) - - self.assertEqual(output_ids.shape, output_ids_kwargs.shape) - self.assertLess(len(output_ids[0, :].tolist()), len(output_ids_without_min_eos_p[0, :].tolist())) - self.assertListEqual(output_ids[0, : len(expected_output_ids)].tolist(), expected_output_ids) - - @slow - def test_generate_coarse(self): - input_ids = self.inputs - - history_prompt = input_ids["history_prompt"] - - # check first ids - expected_output_ids = [11018, 11391, 10651, 11418, 10857, 11620, 10642, 11366, 10312, 11528, 10531, 11516, 10474, 11051, 10524, 11051, ] # fmt: skip - - with no_grad(): - output_ids = self.model.semantic.generate( - **input_ids, - do_sample=False, - temperature=1.0, - semantic_generation_config=self.semantic_generation_config, - ) - - output_ids = self.model.coarse_acoustics.generate( - output_ids, - history_prompt=history_prompt, - do_sample=False, - temperature=1.0, - semantic_generation_config=self.semantic_generation_config, - coarse_generation_config=self.coarse_generation_config, - codebook_size=self.model.generation_config.codebook_size, - ) - - self.assertListEqual(output_ids[0, : len(expected_output_ids)].tolist(), expected_output_ids) - - @slow - def test_generate_fine(self): - input_ids = self.inputs - - history_prompt = input_ids["history_prompt"] - - # fmt: off - expected_output_ids = [ - [1018, 651, 857, 642, 312, 531, 474, 524, 524, 776,], - [367, 394, 596, 342, 504, 492, 27, 27, 822, 822,], - [961, 955, 221, 955, 955, 686, 939, 939, 479, 176,], - [638, 365, 218, 944, 853, 363, 639, 22, 884, 456,], - [302, 912, 524, 38, 174, 209, 879, 23, 910, 227,], - [440, 673, 861, 666, 372, 558, 49, 172, 232, 342,], - [244, 358, 123, 356, 
586, 520, 499, 877, 542, 637,], - [806, 685, 905, 848, 803, 810, 921, 208, 625, 203,], - ] - # fmt: on - - with no_grad(): - output_ids = self.model.semantic.generate( - **input_ids, - do_sample=False, - temperature=1.0, - semantic_generation_config=self.semantic_generation_config, - ) - - output_ids = self.model.coarse_acoustics.generate( - output_ids, - history_prompt=history_prompt, - do_sample=False, - temperature=1.0, - semantic_generation_config=self.semantic_generation_config, - coarse_generation_config=self.coarse_generation_config, - codebook_size=self.model.generation_config.codebook_size, - ) - - # greedy decoding - output_ids = self.model.fine_acoustics.generate( - output_ids, - history_prompt=history_prompt, - temperature=None, - semantic_generation_config=self.semantic_generation_config, - coarse_generation_config=self.coarse_generation_config, - fine_generation_config=self.fine_generation_config, - codebook_size=self.model.generation_config.codebook_size, - ) - - self.assertListEqual(output_ids[0, :, : len(expected_output_ids[0])].tolist(), expected_output_ids) - - @slow - def test_generate_end_to_end(self): - input_ids = self.inputs - - with no_grad(): - self.model.generate(**input_ids) - self.model.generate(**{key: val for (key, val) in input_ids.items() if key != "history_prompt"}) - - @slow - def test_generate_end_to_end_with_args(self): - input_ids = self.inputs - - with no_grad(): - self.model.generate(**input_ids, do_sample=True, temperature=0.6, penalty_alpha=0.6) - self.model.generate(**input_ids, do_sample=True, temperature=0.6, num_beams=4) - - @slow - def test_generate_batching(self): - args = {"do_sample": False, "temperature": None} - - s1 = "I love HuggingFace" - s2 = "In the light of the moon, a little egg lay on a leaf" - voice_preset = "en_speaker_6" - input_ids = self.processor([s1, s2], voice_preset=voice_preset) - - # generate in batch - outputs, audio_lengths = self.model.generate(**input_ids, **args, return_output_lengths=True) - - # generate one-by-one - s1 = self.processor(s1, voice_preset=voice_preset) - s2 = self.processor(s2, voice_preset=voice_preset) - output1 = self.model.generate(**s1, **args) - output2 = self.model.generate(**s2, **args) - - # up until the coarse acoustic model (included), results are the same - # the fine acoustic model introduces small differences - # first verify if same length (should be the same because it's decided in the coarse model) - self.assertEqual(tuple(audio_lengths), (output1.shape[1], output2.shape[1])) - - # then assert almost equal - self.assertTrue(ops.allclose(outputs[0, : audio_lengths[0]], output1.squeeze(), atol=2e-3)) - self.assertTrue(ops.allclose(outputs[1, : audio_lengths[1]], output2.squeeze(), atol=2e-3)) - - # now test single input with return_output_lengths = True - outputs, _ = self.model.generate(**s1, **args, return_output_lengths=True) - self.assertTrue((outputs == output1).all().item()) - - @slow - def test_generate_end_to_end_with_sub_models_args(self): - input_ids = self.inputs - - with no_grad(): - set_seed(0) - self.model.generate( - **input_ids, do_sample=False, temperature=1.0, coarse_do_sample=True, coarse_temperature=0.7 - ) - output_ids_without_min_eos_p = self.model.generate( - **input_ids, - do_sample=True, - temperature=0.9, - coarse_do_sample=True, - coarse_temperature=0.7, - fine_temperature=0.3, - ) - - output_ids_with_min_eos_p = self.model.generate( - **input_ids, - do_sample=True, - temperature=0.9, - coarse_temperature=0.7, - fine_temperature=0.3, - min_eos_p=0.1, - ) - 
self.assertLess( - len(output_ids_with_min_eos_p[0, :].tolist()), len(output_ids_without_min_eos_p[0, :].tolist()) - ) - - def assertListAlmostEqual(self, list1, list2, tol=1e-6): - self.assertEqual(len(list1), len(list2)) - for a, b in zip(list1, list2): - self.assertAlmostEqual(a, b, delta=tol) \ No newline at end of file diff --git a/tests/transformers/models/bart/__init__.py b/tests/transformers/models/bart/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bart/test_modeling_bart.py b/tests/transformers/models/bart/test_modeling_bart.py deleted file mode 100644 index 2dc1e5a9a..000000000 --- a/tests/transformers/models/bart/test_modeling_bart.py +++ /dev/null @@ -1,1512 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore BART model. """ - - -import copy -import tempfile -import unittest - -import numpy as np - -from mindnlp.transformers import BartConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - AutoModelForSequenceClassification, - BartForCausalLM, - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, - BartTokenizer, - ) - from mindnlp.transformers.models.bart.modeling_bart import BartDecoder, BartEncoder, shift_tokens_right - -def prepare_bart_inputs_dict( - config, - input_ids, - decoder_input_ids=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class BartModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - 
intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - # forcing a certain token to be generated, sets all other tokens to -inf - # if however the token to be generated is already at -inf then it can lead token - # `nan` values and thus break generation - self.forced_bos_token_id = None - self.forced_eos_token_id = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return BartConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - forced_bos_token_id=self.forced_bos_token_id, - forced_eos_token_id=self.forced_eos_token_id, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.max_position_embeddings = 100 - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BartModel(config=config).get_decoder().set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(mindspore.bool_)], dim=-1) - - output_from_no_past = model(next_input_ids, 
attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = BartModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = BartEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = BartDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class BartHeadTests(unittest.TestCase): - vocab_size = 99 - - def _get_config_and_data(self): - input_ids = mindspore.tensor( - [ - [71, 82, 18, 33, 46, 91, 2], - [68, 34, 26, 58, 30, 82, 2], - [5, 97, 17, 39, 94, 40, 2], - [76, 83, 94, 25, 70, 78, 2], - [87, 59, 41, 35, 48, 66, 2], - [55, 13, 16, 58, 5, 2, 1], # note padding - [64, 27, 31, 51, 12, 75, 2], - [52, 64, 86, 17, 83, 39, 2], - [48, 61, 9, 24, 71, 82, 2], - [26, 1, 60, 48, 22, 13, 2], - [21, 5, 62, 28, 14, 76, 2], - [45, 98, 37, 86, 59, 48, 2], - [70, 70, 50, 9, 28, 0, 2], - ], - dtype=mindspore.int64, - ) - - batch_size = input_ids.shape[0] - config = BartConfig( - vocab_size=self.vocab_size, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - return config, input_ids, batch_size - - def test_sequence_classification_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - labels = _long_tensor([2] * batch_size) - model = BartForSequenceClassification(config) - outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels) - expected_shape = (batch_size, config.num_labels) - self.assertEqual(outputs["logits"].shape, expected_shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_question_answering_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - sequence_labels = ids_tensor([batch_size], 2) - model = BartForQuestionAnswering(config) - outputs = model( - 
input_ids=input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - - self.assertEqual(outputs["start_logits"].shape, input_ids.shape) - self.assertEqual(outputs["end_logits"].shape, input_ids.shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_lm_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size) - lm_model = BartForConditionalGeneration(config) - outputs = lm_model(input_ids=input_ids, labels=lm_labels) - expected_shape = (batch_size, input_ids.shape[1], config.vocab_size) - self.assertEqual(outputs["logits"].shape, expected_shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_lm_uneven_forward(self): - config = BartConfig( - vocab_size=self.vocab_size, - d_model=14, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=8, - decoder_ffn_dim=8, - max_position_embeddings=48, - ) - lm_model = BartForConditionalGeneration(config) - context = mindspore.tensor( - [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=mindspore.int64 - ) - summary = mindspore.tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=mindspore.int64) - outputs = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) - expected_shape = (*summary.shape, config.vocab_size) - self.assertEqual(outputs["logits"].shape, expected_shape) - - def test_generate_beam_search(self): - input_ids = mindspore.tensor([[71, 82, 2], [68, 34, 2]], dtype=mindspore.int64) - config = BartConfig( - vocab_size=self.vocab_size, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - lm_model = BartForConditionalGeneration(config) - lm_model.set_train(False) - - max_length = 5 - generated_ids = lm_model.generate( - input_ids.copy(), - do_sample=True, - num_return_sequences=1, - num_beams=2, - no_repeat_ngram_size=3, - max_length=max_length, - ) - self.assertEqual(generated_ids.shape, (input_ids.shape[0], max_length)) - - def test_shift_tokens_right(self): - input_ids = mindspore.tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=mindspore.int64) - shifted = shift_tokens_right(input_ids, 1, 2) - n_pad_before = input_ids.eq(1).float().sum() - n_pad_after = shifted.eq(1).float().sum() - self.assertEqual(shifted.shape, input_ids.shape) - self.assertEqual(n_pad_after, n_pad_before - 1) - self.assertTrue(ops.eq(shifted[:, 0], 2).all()) - - @slow - def test_tokenization(self): - tokenizer = BartTokenizer.from_pretrained("facebook/bart-large") - examples = [" Hello world", " DomDramg"] # need leading spaces for equality - fairseq_results = [ - mindspore.tensor([0, 20920, 232, 2]), - mindspore.tensor([0, 11349, 495, 4040, 571, 2]), - ] - for ex, desired_result in zip(examples, fairseq_results): - bart_toks = tokenizer.encode(ex, return_tensors="ms").squeeze() - assert_tensors_close(desired_result.long(), bart_toks, prefix=ex) - - @require_mindspore - def test_generate_fp16(self): - config, input_ids, batch_size = self._get_config_and_data() - attention_mask = input_ids.ne(1) - model = BartForConditionalGeneration(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, 
num_return_sequences=3) - - def test_dummy_inputs(self): - config, *_ = self._get_config_and_data() - model = BartForConditionalGeneration(config).set_train(False) - model(**model.dummy_inputs) - - def test_resize_tokens_embeddings_more(self): - config, input_ids, _ = self._get_config_and_data() - - def _get_embs(m): - return (m.get_input_embeddings().weight.copy(), m.get_output_embeddings().weight.copy()) - - model = BartForConditionalGeneration(config).set_train(False) - input, output = _get_embs(model) - self.assertTrue(ops.eq(input, output).all()) - new_vocab_size = 45 - model.resize_token_embeddings(new_vocab_size) - input_new, output_new = _get_embs(model) - self.assertEqual(input_new.shape, (new_vocab_size, config.d_model)) - self.assertEqual(output_new.shape, (new_vocab_size, config.d_model)) - self.assertTrue(ops.eq(input_new, output_new).all()) - - -@require_mindspore -class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (BartModel, BartForConditionalGeneration, BartForSequenceClassification, BartForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (BartForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": BartForConditionalGeneration, - "feature-extraction": BartModel, - "fill-mask": BartForConditionalGeneration, - "question-answering": BartForQuestionAnswering, - "summarization": BartForConditionalGeneration, - "text-classification": BartForSequenceClassification, - "text-generation": BartForCausalLM, - "text2text-generation": BartForConditionalGeneration, - "translation": BartForConditionalGeneration, - "zero-shot": BartForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = False # Fix me Michael - test_pruning = False - - def setUp(self): - self.model_tester = BartModelTester(self) - self.config_tester = ConfigTester(self, config_class=BartConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - # BartForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (BartModel, BartForConditionalGeneration, BartForQuestionAnswering): - model = model_class(config) - model - model.set_train(False) - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del 
inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - model(**inputs)[0] - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = BartForConditionalGeneration(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - @unittest.skip("Does not support conversations.") - def test_pipeline_conversational(self): - pass - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -@require_mindspore -@slow -class FastIntegrationTests(unittest.TestCase): - """These tests are useful for debugging since they operate on a model with 1 encoder layer and 1 decoder layer.""" - - @cached_property - def tok(self): - return BartTokenizer.from_pretrained("facebook/bart-large") - - @cached_property - def xsum_1_1_model(self): - return BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-xsum-1-1") - - def test_xsum_1_1_generation(self): - hf = self.xsum_1_1_model - tok = self.tok - ARTICLE = ( - "The Palestinian Authority officially became the 123rd member of the International Criminal Court on" - " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The" - " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based." - " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its" - ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East' - ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the' - " situation in Palestinian territories, paving the way for possible war crimes investigations against" - " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and" - " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the" - " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a" - ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the' - ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an' - ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." 
Judge' - " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the" - ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine' - " acquires all the rights as well as responsibilities that come with being a State Party to the Statute." - ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights' - ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should' - " immediately end their pressure, and countries that support universal acceptance of the court's treaty" - ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the' - " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's" - ' decision to join a treaty to which over 100 countries around the world are members." In January, when' - " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an" - ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"' - " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a" - ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in' - ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We' - ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"' - " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the" - ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the' - " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou" - ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war' - " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry" - " will include alleged war crimes committed since June. The International Criminal Court was set up in" - " 2002 to prosecute genocide, crimes against humanity and war crimes." - ) - EXPECTED = ( - " The International Criminal Court (ICC) has announced that it has been announced by the International" - " Criminal court." - ) - - dct = tok(ARTICLE, return_tensors="ms") - generated_ids = hf.generate(**dct, num_beams=4) - result = tok.batch_decode(generated_ids, skip_special_tokens=True)[0] - assert EXPECTED == result - - def test_xsum_1_1_batch_generation(self): - # test batch - - batch = self.tok( - [ - "The Palestinian Authority officially became the 123rd member of the International Criminal Court on" - " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories." - " The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is" - " based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted" - ' its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including' - ' East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination' - " into the situation in Palestinian territories, paving the way for possible war crimes investigations" - " against Israelis. 
As members of the court, Palestinians may be subject to counter-charges as well." - " Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts" - " to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony," - ' said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome' - ' Statute today, the world is also a step closer to ending a long era of impunity and injustice," he' - ' said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of' - ' justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was' - ' just the first step for the Palestinians. "As the Rome Statute today enters into force for the State' - " of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a" - ' State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she' - ' said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize' - " Palestine for joining the ICC should immediately end their pressure, and countries that support" - " universal acceptance of the court's treaty should speak out to welcome its membership,\" said" - " Balkees Jarrah, international justice counsel for the group. \"What's objectionable is the attempts" - " to undermine international justice, not Palestine's decision to join a treaty to which over 100" - ' countries around the world are members." In January, when the preliminary ICC examination was' - " opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was" - ' overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s' - ' decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we' - ' do not believe that it is eligible to join the ICC," the State Department said in a statement. It' - ' urged the warring sides to resolve their differences through direct negotiations. "We will continue' - ' to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said.' - " But the ICC begs to differ with the definition of a state for its purposes and refers to the" - ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows' - " the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor" - ' Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality."' - " The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The" - " inquiry will include alleged war crimes committed since June. The International Criminal Court was" - " set up in 2002 to prosecute genocide, crimes against humanity and war crimes.", - "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted" - " Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor" - ' Brice Robin told CNN that "so far no videos were used in the crash investigation." 
He added, "A' - " person who has such a video needs to immediately give it to the investigators.\" Robin's comments" - " follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video" - " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the" - " French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was" - " recovered from a phone at the wreckage site. The two publications described the supposed video, but" - " did not post it on their websites. The publications said that they watched the video, which was" - " found by a source close to the investigation. \"One can hear cries of 'My God' in several" - ' languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps' - " of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy" - ' shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing' - " scene,\" said Julian Reichelt, editor-in-chief of Bild online. An official with France's accident" - " investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc" - " Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the" - ' Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell' - ' phones have been collected at the site, he said, but that they "hadn\'t been exploited yet."' - " Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute" - " in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working" - " hand-in-hand with investigators. But none of the cell phones found so far have been sent to the" - " institute, Menichini said. Asked whether staff involved in the search could have leaked a memory" - ' card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett:' - ' Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are' - ' "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered' - ' cell phones from the crash site after Bild and Paris Match published their reports. "That is' - " something we did not know before. ... Overall we can say many things of the investigation weren't" - ' revealed by the investigation at the beginning," he said. What was mental state of Germanwings' - " co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled" - " depression years before he took the controls of Germanwings Flight 9525, which he's accused of" - " deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school" - ' in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email' - " correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa" - " said, included medical documents he submitted in connection with resuming his flight training. The" - " announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz's battle" - " with depression, allowed him to continue training and ultimately put him in the cockpit. 
Lufthansa," - " whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday" - ' as a "swift and seamless clarification" and said it was sharing the information and documents --' - " including training and medical records -- with public prosecutors. Spohr traveled to the crash site" - " Wednesday, where recovery teams have been working for the past week to recover human remains and" - " plane debris scattered across a steep mountainside. He saw the crisis center set up in" - " Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving" - " families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no" - " visible human remains were left at the site but recovery teams would keep searching. French" - " President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the" - " victims using DNA analysis by the end of the week, sooner than authorities had previously suggested." - " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini" - " said. Among those personal belongings could be more cell phones belonging to the 144 passengers and" - " six crew on board. Check out the latest from our correspondents . The details about Lubitz's" - " correspondence with the flight school during his training were among several developments as" - " investigators continued to delve into what caused the crash and Lubitz's possible motive for" - " downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical" - ' certificate, had passed all his examinations and "held all the licenses required." Earlier, a' - " spokesman for the prosecutor's office in Dusseldorf, Christoph Kumpa, said medical records reveal" - " Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent" - " psychotherapy before he got his pilot's license. Kumpa emphasized there's no evidence suggesting" - " Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether" - " Lubitz feared his medical condition would cause him to lose his pilot's license, a European" - ' government official briefed on the investigation told CNN on Tuesday. While flying was "a big part' - " of his life,\" the source said, it's only one theory being considered. Another source, a law" - " enforcement official briefed on the investigation, also told CNN that authorities believe the" - " primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly" - " because of his medical problems. Lubitz's girlfriend told investigators he had seen an eye doctor" - " and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had" - " psychological issues, the European government official said. But no matter what details emerge about" - " his previous mental health struggles, there's more to the story, said Brian Russell, a forensic" - ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the' - " fact that maybe they weren't going to keep doing their job and they're upset about that and so" - ' they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels' - " entitled to also take that rage and turn it outward on 149 other people who had nothing to do with" - " the person's problems.\" Germanwings crash compensation: What we know . 
Who was the captain of" - " Germanwings Flight 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from" - " Dusseldorf, while Laura Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff," - " Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.", - ], - return_tensors="ms", - padding="longest", - truncation=True, - ) - generated_ids = self.xsum_1_1_model.generate(**batch, num_beams=4) - result = self.tok.batch_decode(generated_ids, skip_special_tokens=True) - assert ( - result[0] - == " The International Criminal Court (ICC) has announced that it has been announced by the International" - " Criminal court." - ) - assert ( - result[1] - == " An investigation into the crash that killed at least 10 people in the French capital has been" - " released by the French police investigating the crash." - ) - - def test_encoder_equiv(self): - # test batch - - batch = self.tok( - [ - "The Palestinian Authority officially became the 123rd member of the International Criminal Court on" - " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories." - " The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is" - " based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted" - ' its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including' - ' East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination' - " into the situation in Palestinian territories, paving the way for possible war crimes investigations" - " against Israelis. As members of the court, Palestinians may be subject to counter-charges as well." - " Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts" - " to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony," - ' said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome' - ' Statute today, the world is also a step closer to ending a long era of impunity and injustice," he' - ' said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of' - ' justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was' - ' just the first step for the Palestinians. "As the Rome Statute today enters into force for the State' - " of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a" - ' State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she' - ' said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize' - " Palestine for joining the ICC should immediately end their pressure, and countries that support" - " universal acceptance of the court's treaty should speak out to welcome its membership,\" said" - " Balkees Jarrah, international justice counsel for the group. \"What's objectionable is the attempts" - " to undermine international justice, not Palestine's decision to join a treaty to which over 100" - ' countries around the world are members." In January, when the preliminary ICC examination was' - " opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was" - ' overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s' - ' decision. 
"As we have said repeatedly, we do not believe that Palestine is a state and therefore we' - ' do not believe that it is eligible to join the ICC," the State Department said in a statement. It' - ' urged the warring sides to resolve their differences through direct negotiations. "We will continue' - ' to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said.' - " But the ICC begs to differ with the definition of a state for its purposes and refers to the" - ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows' - " the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor" - ' Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality."' - " The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The" - " inquiry will include alleged war crimes committed since June. The International Criminal Court was" - " set up in 2002 to prosecute genocide, crimes against humanity and war crimes.", - "The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted" - " Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor" - ' Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A' - " person who has such a video needs to immediately give it to the investigators.\" Robin's comments" - " follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video" - " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the" - " French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was" - " recovered from a phone at the wreckage site. The two publications described the supposed video, but" - " did not post it on their websites. The publications said that they watched the video, which was" - " found by a source close to the investigation. \"One can hear cries of 'My God' in several" - ' languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps' - " of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy" - ' shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing' - " scene,\" said Julian Reichelt, editor-in-chief of Bild online. An official with France's accident" - " investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc" - " Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the" - ' Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell' - ' phones have been collected at the site, he said, but that they "hadn\'t been exploited yet."' - " Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute" - " in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working" - " hand-in-hand with investigators. But none of the cell phones found so far have been sent to the" - " institute, Menichini said. Asked whether staff involved in the search could have leaked a memory" - ' card to the media, Menichini answered with a categorical "no." 
Reichelt told "Erin Burnett:' - ' Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are' - ' "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered' - ' cell phones from the crash site after Bild and Paris Match published their reports. "That is' - " something we did not know before. ... Overall we can say many things of the investigation weren't" - ' revealed by the investigation at the beginning," he said. What was mental state of Germanwings' - " co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled" - " depression years before he took the controls of Germanwings Flight 9525, which he's accused of" - " deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school" - ' in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email' - " correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa" - " said, included medical documents he submitted in connection with resuming his flight training. The" - " announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz's battle" - " with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa," - " whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday" - ' as a "swift and seamless clarification" and said it was sharing the information and documents --' - " including training and medical records -- with public prosecutors. Spohr traveled to the crash site" - " Wednesday, where recovery teams have been working for the past week to recover human remains and" - " plane debris scattered across a steep mountainside. He saw the crisis center set up in" - " Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving" - " families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no" - " visible human remains were left at the site but recovery teams would keep searching. French" - " President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the" - " victims using DNA analysis by the end of the week, sooner than authorities had previously suggested." - " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini" - " said. Among those personal belongings could be more cell phones belonging to the 144 passengers and" - " six crew on board. Check out the latest from our correspondents . The details about Lubitz's" - " correspondence with the flight school during his training were among several developments as" - " investigators continued to delve into what caused the crash and Lubitz's possible motive for" - " downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical" - ' certificate, had passed all his examinations and "held all the licenses required." Earlier, a' - " spokesman for the prosecutor's office in Dusseldorf, Christoph Kumpa, said medical records reveal" - " Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent" - " psychotherapy before he got his pilot's license. Kumpa emphasized there's no evidence suggesting" - " Lubitz was suicidal or acting aggressively before the crash. 
Investigators are looking into whether" - " Lubitz feared his medical condition would cause him to lose his pilot's license, a European" - ' government official briefed on the investigation told CNN on Tuesday. While flying was "a big part' - " of his life,\" the source said, it's only one theory being considered. Another source, a law" - " enforcement official briefed on the investigation, also told CNN that authorities believe the" - " primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly" - " because of his medical problems. Lubitz's girlfriend told investigators he had seen an eye doctor" - " and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had" - " psychological issues, the European government official said. But no matter what details emerge about" - " his previous mental health struggles, there's more to the story, said Brian Russell, a forensic" - ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the' - " fact that maybe they weren't going to keep doing their job and they're upset about that and so" - ' they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels' - " entitled to also take that rage and turn it outward on 149 other people who had nothing to do with" - " the person's problems.\" Germanwings crash compensation: What we know . Who was the captain of" - " Germanwings Flight 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from" - " Dusseldorf, while Laura Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff," - " Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.", - ], - return_tensors="ms", - padding="longest", - truncation=True, - ) - features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state - expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]] - assert_tensors_close(features[0, :3, :3], mindspore.tensor(expected), atol=1e-3) - - -class BartModelIntegrationTests(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return BartTokenizer.from_pretrained("facebook/bart-large") - - @slow - def test_inference_no_head(self): - model = BartModel.from_pretrained("facebook/bart-large") - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - attention_mask = input_ids.ne(model.config.pad_token_id) - output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state - expected_shape = (1, 11, 1024) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]] - ) - print(output[:, :3, :3].asnumpy()) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - @slow - def test_base_mask_filling(self): - pbase = pipeline(task="fill-mask", model="facebook/bart-base") - src_text = [" I went to the ."] - results = [x["token_str"] for x in pbase(src_text)] - assert " bathroom" in results - - @slow - def test_large_mask_filling(self): - plarge = pipeline(task="fill-mask", model="facebook/bart-large") - src_text = [" I went to the ."] - results = [x["token_str"] for x in plarge(src_text)] - expected_results = [" bathroom", " gym", " wrong", " movies", " hospital"] - self.assertListEqual(results, expected_results) - - @slow - def test_mnli_inference(self): - example_b = [0, 
31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1] - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b]) - - model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli") - # eval called in from_pre - attention_mask = input_ids.ne(model.config.pad_token_id) - # Test that model hasn't changed - outputs = model(input_ids=input_ids, attention_mask=attention_mask) - - batched_logits = outputs.logits - expected_shape = (2, 3) - self.assertEqual(batched_logits.shape, expected_shape) - expected_slice = mindspore.tensor([[0.1907, 1.4342, -1.0289]]) - logits_arr = batched_logits[0] - - # Test that padding does not change results - input_ids_no_pad = _long_tensor([example_b[:-1]]) - attention_mask_no_pad = input_ids_no_pad.ne(model.config.pad_token_id) - - logits2 = model(input_ids=input_ids_no_pad, attention_mask=attention_mask_no_pad).logits.squeeze() - assert_tensors_close(batched_logits[1], logits2, atol=1e-3) - assert_tensors_close(expected_slice, logits_arr, atol=1e-3) - - @slow - def test_xsum_summarization_same_as_fairseq(self): - model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum") - tok = self.default_tokenizer - - PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" - - EXPECTED_SUMMARY = ( - "California's largest power company has begun shutting off electricity to thousands of customers in the" - " state." - ) - dct = tok.batch_encode_plus( - [PGE_ARTICLE], - max_length=1024, - padding="max_length", - truncation=True, - return_tensors="ms", - ) - - hypotheses_batch = model.generate( - input_ids=dct["input_ids"], - attention_mask=dct["attention_mask"], - num_beams=2, - max_length=62, - min_length=11, - length_penalty=1.0, - no_repeat_ngram_size=3, - early_stopping=True, - decoder_start_token_id=model.config.eos_token_id, - ) - - decoded = tok.batch_decode( - hypotheses_batch, - skip_special_tokens=True, - ) - self.assertEqual(EXPECTED_SUMMARY, decoded[0]) - - def test_xsum_config_generation_params(self): - config = BartConfig.from_pretrained("facebook/bart-large-xsum") - expected_params = {"num_beams": 6, "do_sample": False, "early_stopping": True, "length_penalty": 1.0} - config_params = {k: getattr(config, k, "MISSING") for k, v in expected_params.items()} - self.assertDictEqual(expected_params, config_params) - - @slow - def test_cnn_summarization_same_as_fairseq(self): - hf = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") - tok = BartTokenizer.from_pretrained("facebook/bart-large") - - FRANCE_ARTICLE = ( # @noq - " Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings" - " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane." - ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."' - ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s' - " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video" - " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French" - " Alps. All 150 on board were killed. 
Paris Match and Bild reported that the video was recovered from a" - " phone at the wreckage site. The two publications described the supposed video, but did not post it on" - " their websites. The publications said that they watched the video, which was found by a source close to" - " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported." - ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the' - " cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the" - ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,' - " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said" - " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman" - " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the" - ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,' - ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be' - " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by" - " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so" - " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could" - ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin' - ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match' - ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered' - ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something' - " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the" - ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline' - " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the" - " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the" - ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of' - ' severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school' - " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in" - " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent" - " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and" - " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%" - ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was' - " sharing the information and documents -- including training and medical records -- with public" - " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the" - " past week to recover human remains and plane debris scattered across a steep mountainside. 
He saw the" - " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash" - " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late" - " Tuesday that no visible human remains were left at the site but recovery teams would keep searching." - " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all" - " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested." - " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said." - " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew" - " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with" - " the flight school during his training were among several developments as investigators continued to" - " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa" - " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his" - ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in' - " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at" - " some point before his aviation career and underwent psychotherapy before he got his pilot's license." - " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the" - " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to" - " lose his pilot's license, a European government official briefed on the investigation told CNN on" - ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being' - " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that" - " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would" - " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had" - " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded" - " he had psychological issues, the European government official said. But no matter what details emerge" - " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic" - ' psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact' - " that maybe they weren't going to keep doing their job and they're upset about that and so they're" - ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to' - " also take that rage and turn it outward on 149 other people who had nothing to do with the person's" - ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight' - " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura" - " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine" - " Amiel and Anna-Maja Rappard contributed to this report." 
- ) - - SHORTER_ARTICLE = ( - " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on" - " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The" - " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based." - " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its" - ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East' - ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the' - " situation in Palestinian territories, paving the way for possible war crimes investigations against" - " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and" - " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the" - " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a" - ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the' - ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an' - ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge' - " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the" - ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine' - " acquires all the rights as well as responsibilities that come with being a State Party to the Statute." - ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights' - ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should' - " immediately end their pressure, and countries that support universal acceptance of the court's treaty" - ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the' - " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's" - ' decision to join a treaty to which over 100 countries around the world are members." In January, when' - " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an" - ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"' - " disagreed with the court's decision. \"As we have said repeatedly, we do not believe that Palestine is a" - ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in' - ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We' - ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"' - " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the" - ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the' - " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou" - ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war' - " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. 
The inquiry" - " will include alleged war crimes committed since June. The International Criminal Court was set up in" - " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder" - " and Faith Karimi contributed to this report." - ) - - # The below article tests that we don't add any hypotheses outside of the top n_beams - IRAN_ARTICLE = ( - " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran" - " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively" - " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger." - " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli" - " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a" - " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since" - " the announcement of the new framework will likely result in more heat than light. It will not be helped" - " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ." - " The most misleading assertion, despite universal rejection by experts, is that the negotiations'" - " objective at the outset was the total elimination of any nuclear program in Iran. That is the position" - " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it" - " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has" - " always been to structure an agreement or series of agreements so that Iran could not covertly develop a" - " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded" - " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by" - " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another" - " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite" - " sharp accusations by some in the United States and its allies, Iran denies having such a program, and" - " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's" - " continued cooperation with International Atomic Energy Agency inspections is further evidence on this" - " point, and we'll know even more about Iran's program in the coming months and years because of the deal." - " In fact, the inspections provisions that are part of this agreement are designed to protect against any" - " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that" - " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter" - " warning that a deal might be killed by Congress or a future president). This of course is not the case." - " The talks were between Iran and the five permanent members of the U.N. Security Council (United States," - " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has" - " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement" - " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran" - " and threaten NATO cohesion in other areas. 
Another questionable assertion is that this agreement" - " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the" - " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased" - " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes" - " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear" - " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going" - " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such" - " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the" - ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not' - " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New" - " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement" - " with Iran will not be so balanced. The restrictions and obligations in the final framework agreement" - " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove" - " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally" - " some insist that any agreement must address Iranian missile programs, human rights violations or support" - " for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are" - " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in" - " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it" - " affects the security of our negotiating partners and allies, including Israel. Those judgments should be" - " fact-based, not based on questionable assertions or dubious assumptions." - ) - - ARTICLE_SUBWAY = ( - " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A" - " year later, she got married again in Westchester County, but to a different man and without divorcing" - " her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos" - ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married' - " once more, this time in the Bronx. In an application for a marriage license, she stated it was her" - ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false' - ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage' - " license application, according to court documents. Prosecutors said the marriages were part of an" - " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to" - " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was" - " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New" - " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total," - " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All" - " occurred either in Westchester County, Long Island, New Jersey or the Bronx. 
She is believed to still be" - " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors" - " said the immigration scam involved some of her husbands, who filed for permanent residence status" - " shortly after the marriages. Any divorces happened only after such filings were approved. It was" - " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District" - " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's" - ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,' - " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his" - " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces" - " up to four years in prison. Her next court appearance is scheduled for May 18." - ) - - dct = tok.batch_encode_plus( - [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], - max_length=1024, - padding="max_length", - truncation_strategy="only_first", - truncation=True, - return_tensors="ms", - ) - - self.assertEqual(1024, dct["input_ids"].shape[1]) - hypotheses_batch = hf.generate( - input_ids=dct["input_ids"], - attention_mask=dct["attention_mask"], - num_beams=2, - ) - assert hypotheses_batch[:, 1].eq(0).all().item() - - EXPECTED = [ - "A French prosecutor says he is not aware of any video footage from on board the plane. Two German " - "magazines claim to have found a cell phone video showing the crash. The publications say they watched " - "the video, which was found by a source close to the investigation. All 150 on board Germanwings Flight " - "9525 were killed.", - "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court " - "jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the " - "Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a " - "move toward greater justice.", - "U.S. and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The " - "debate that has already begun will likely result in more heat than light. He says critics have made " - "dubious assumptions and doubtful assertions. Bergen says the goal was to block Iran from building a " - "nuclear weapon.", - "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors " - "say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the " - "Bronx on Friday. If convicted, she faces up to four years in prison.", - ] - - generated_summaries = tok.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - assert generated_summaries == EXPECTED - - @slow - def test_contrastive_search_bart(self): - article = ( - " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A" - " year later, she got married again in Westchester County, but to a different man and without divorcing" - " her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos" - ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married' - " once more, this time in the Bronx. 
In an application for a marriage license, she stated it was her" - ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false' - ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage' - " license application, according to court documents. Prosecutors said the marriages were part of an" - " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to" - " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was" - " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New" - " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total," - " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All" - " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be" - " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors" - " said the immigration scam involved some of her husbands, who filed for permanent residence status" - " shortly after the marriages. Any divorces happened only after such filings were approved. It was" - " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District" - " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's" - ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,' - " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his" - " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces" - " up to four years in prison. Her next court appearance is scheduled for May 18." - ) - bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") - input_ids = bart_tokenizer( - article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="ms" - ).input_ids - - outputs = bart_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64, num_beams=1) - generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "Liana Barrientos, 39, pleaded not guilty to charges related to false marriage statements. " - "Prosecutors say she married at least 10 times, sometimes within two weeks of each other. She is " - "accused of being part of an immigration scam to get permanent residency. 
If convicted, she faces up " - "to four years in" - ], - ) - - @slow - def test_decoder_attention_mask(self): - model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0) - tokenizer = self.default_tokenizer - sentence = "UN Chief Says There Is No in Syria" - input_ids = tokenizer(sentence, return_tensors="ms").input_ids - padding_size = 3 - decoder_input_ids = mindspore.tensor( - [ - [model.config.decoder_start_token_id] - + padding_size * [model.config.pad_token_id] - + [model.config.bos_token_id] - ], - dtype=mindspore.int64, - ) - decoder_attention_mask = ops.where(decoder_input_ids == model.config.pad_token_id, mindspore.tensor(0), 1) - generated_ids = model.generate( - input_ids=input_ids, - use_cache=False, - max_new_tokens=20, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - generated_sentence = tokenizer.batch_decode(generated_ids)[0] - expected_sentence = "UN Chief Says There Is No Plan B for Peace in Syria" - self.assertEqual(generated_sentence, expected_sentence) - - -class BartStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = BartConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - encoder_layers=self.decoder_layers, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - 
decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.decoder_seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - return ( - config, - input_ids, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = BartDecoder(config=config).set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = BartDecoder(config=config).set_train(False) - - # create attention mask - attn_mask = ops.ones(*input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones(attn_mask.shape[0], 1, dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = 
output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class BartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BartDecoder, BartForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (BartForCausalLM,) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - is_encoder_decoder = False - test_missing_keys = False - - def setUp( - self, - ): - self.model_tester = BartStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=BartConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return - - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/transformers/models/beit/__init__.py b/tests/transformers/models/beit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/beit/test_modeling_beit.py b/tests/transformers/models/beit/test_modeling_beit.py deleted file mode 100644 index 62c62c45f..000000000 --- a/tests/transformers/models/beit/test_modeling_beit.py +++ /dev/null @@ -1,508 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch BEiT model.
""" - - -import unittest - -from datasets import load_dataset -from packaging import version -import numpy as np - -from mindnlp.transformers import BeitConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - BeitBackbone, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitModel, - ) - from mindnlp.transformers.models.auto.modeling_auto import MODEL_FOR_BACKBONE_MAPPING_NAMES, MODEL_MAPPING_NAMES - from mindnlp.transformers.models.beit.modeling_beit import BEIT_PRETRAINED_MODEL_ARCHIVE_LIST - - -if is_vision_available(): - import PIL - from PIL import Image - - from mindnlp.transformers import BeitImageProcessor - - -class BeitModelTester: - def __init__( - self, - parent, - vocab_size=100, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=4, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - num_labels=3, - scope=None, - out_indices=[1, 2, 3, 4], - out_features=["stage1", "stage2", "stage3", "stage4"], - ): - self.parent = parent - self.vocab_size = vocab_size - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.out_indices = out_indices - self.out_features = out_features - self.num_labels = num_labels - - # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - pixel_labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels, pixel_labels - - def get_config(self): - return BeitConfig( - vocab_size=self.vocab_size, - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - 
hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - out_indices=self.out_indices, - out_features=self.out_features, - ) - - def create_and_check_model(self, config, pixel_values, labels, pixel_labels): - model = BeitModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_backbone(self, config, pixel_values, labels, pixel_labels): - model = BeitBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify hidden states - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - expected_height = expected_width = self.image_size // config.patch_size - self.parent.assertListEqual( - list(result.feature_maps[0].shape), [self.batch_size, self.hidden_size, expected_height, expected_width] - ) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - - # verify backbone works with out_features=None - config.out_features = None - model = BeitBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual( - list(result.feature_maps[0].shape), [self.batch_size, self.hidden_size, expected_height, expected_width] - ) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - - def create_and_check_for_masked_lm(self, config, pixel_values, labels, pixel_labels): - model = BeitForMaskedImageModeling(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length - 1, self.vocab_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.type_sequence_label_size - model = BeitForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = BeitForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = BeitForSemanticSegmentation(config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) - ) - result = model(pixel_values, labels=pixel_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels, pixel_labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class BeitModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests 
of test_modeling_common.py, as BEiT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - BeitModel, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitBackbone, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": BeitModel, - "image-classification": BeitForImageClassification, - "image-segmentation": BeitForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = BeitModelTester(self) - self.config_tester = ConfigTester(self, config_class=BeitConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="BEiT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @require_mindspore - @unittest.skip(reason="BEiT has some layers using `add_module` which doesn't work well with `nn.DataParallel`") - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip(reason="BEiT does not support feedforward chunking yet") - def test_feed_forward_chunking(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - def test_training(self): - if not self.model_tester.is_training: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - # we don't test BeitForMaskedImageModeling - if model_class.__name__ in [ - *MODEL_MAPPING_NAMES.values(), - *MODEL_FOR_BACKBONE_MAPPING_NAMES.values(), - "BeitForMaskedImageModeling", - ]: - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def 
test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - # we skip lambda parameters as these require special initial values - # determined by config.layer_scale_init_value - if "lambda" in name: - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @slow - def test_model_from_pretrained(self): - for model_name in BEIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = BeitModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class BeitModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") if is_vision_available() else None - - @slow - def test_inference_masked_image_modeling_head(self): - model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k") - - image_processor = self.default_image_processor - image = prepare_img() - pixel_values = image_processor(images=image, return_tensors="ms").pixel_values - - # prepare bool_masked_pos - bool_masked_pos = ops.ones((1, 196), dtype=mindspore.bool_) - - # forward pass - outputs = model(pixel_values=pixel_values, bool_masked_pos=bool_masked_pos) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 196, 8192) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-3.2437, 0.5072, -13.9174], [-3.2456, 0.4948, -13.9401], [-3.2033, 0.5121, -13.8550]] - ) - - self.assertTrue(np.allclose(logits[bool_masked_pos][:3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-2)) - - @slow - def test_inference_image_classification_head_imagenet_1k(self): - model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-1.2385, -1.0987, -1.0108]) - - self.assertTrue(np.allclose(logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - expected_class_idx = 281 - self.assertEqual(logits.argmax(-1).item(), expected_class_idx) - - @slow - def test_inference_image_classification_head_imagenet_22k(self): - model = BeitForImageClassification.from_pretrained("microsoft/beit-large-patch16-224-pt22k-ft22k") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 21841) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor([1.6881, -0.2787, 0.5901]) - - 
self.assertTrue(np.allclose(logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - expected_class_idx = 2396 - self.assertEqual(logits.argmax(-1).item(), expected_class_idx) - - @slow - def test_inference_semantic_segmentation(self): - model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640") - model = model - - image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False) - - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") - image = Image.open(ds[0]["file"]) - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 150, 160, 160) - self.assertEqual(logits.shape, expected_shape) - - is_pillow_less_than_9 = version.parse(PIL.__version__) < version.parse("9.0.0") - - if is_pillow_less_than_9: - expected_slice = mindspore.tensor( - [ - [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]], - [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]], - [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]], - ], - ) - else: - expected_slice = mindspore.tensor( - [ - [[-4.8960, -2.3688, -3.0355], [-2.8478, -0.9836, -1.7418], [-2.9449, -1.3332, -2.1456]], - [[-5.8081, -3.4124, -4.1006], [-3.8561, -2.2081, -3.0323], [-3.8365, -2.4601, -3.3669]], - [[-0.0309, 3.9868, 4.0540], [2.9640, 4.6877, 4.9976], [3.2081, 4.7690, 4.9942]], - ], - ) - - self.assertTrue(np.allclose(logits[0, :3, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @slow - def test_post_processing_semantic_segmentation(self): - model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640") - - image_processor = BeitImageProcessor(do_resize=True, size=640, do_center_crop=False) - - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") - image = Image.open(ds[0]["file"]) - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)]) - expected_shape = (500, 300) - self.assertEqual(segmentation[0].shape, expected_shape) - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) - expected_shape = (160, 160) - self.assertEqual(segmentation[0].shape, expected_shape) - - -@require_mindspore -class BeitBackboneTest(unittest.TestCase, BackboneTesterMixin): - all_model_classes = (BeitBackbone,) if is_mindspore_available() else () - config_class = BeitConfig - - def setUp(self): - self.model_tester = BeitModelTester(self) \ No newline at end of file diff --git a/tests/transformers/models/bert/__init__.py b/tests/transformers/models/bert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bert/test_modeling_bert.py b/tests/transformers/models/bert/test_modeling_bert.py deleted file mode 100644 index b93b9c108..000000000 --- a/tests/transformers/models/bert/test_modeling_bert.py +++ /dev/null @@ -1,695 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import tempfile -import unittest - -from mindnlp.transformers import AutoTokenizer, BertConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import ( - CaptureLogger, - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - BertLMHeadModel, - BertModel, - logging, - ) - -mindspore.set_context(pynative_synchronize=True) -class BertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = 
self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - """ - Returns a tiny configuration by default. - """ - return BertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = BertModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = BertLMHeadModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, 
token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_model_for_causal_lm_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = BertLMHeadModel(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = BertLMHeadModel(config=config).eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_next_sequence_prediction( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForNextSentencePrediction(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) - - def create_and_check_for_pretraining( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForPreTraining(config=config) - model.eval() - result = model( - input_ids, - 
attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - next_sentence_label=sequence_labels, - ) - self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BertForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = BertForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = BertForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = BertForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - BertModel, - BertLMHeadModel, - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (BertLMHeadModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BertModel, - 
"fill-mask": BertForMaskedLM, - "question-answering": BertForQuestionAnswering, - "text-classification": BertForSequenceClassification, - "text-generation": BertLMHeadModel, - "token-classification": BertForTokenClassification, - "zero-shot": BertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - model_split_percents = [0.5, 0.8, 0.9] - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - inputs_dict["next_sentence_label"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = BertModelTester(self) - self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bertforcausalLM - pass - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_causal_lm_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - 
config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_next_sequence_prediction(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_warning_if_padding_and_no_attention_mask(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.model_tester.prepare_config_and_inputs() - - # Set pad tokens in the input_ids - input_ids[0, 0] = config.pad_token_id - - # Check for warnings if the attention_mask is missing. - logger = logging.get_logger("mindnlp.transformers.modeling_utils") - # clear cache so we can test the warning is emitted (from `warning_once`). 
- logger.warning_once.cache_clear() - - with CaptureLogger(logger) as cl: - model = BertModel(config=config) - model.eval() - model(input_ids, attention_mask=None, token_type_ids=token_type_ids) - self.assertIn("We strongly recommend passing in an `attention_mask`", cl.out) - - @slow - def test_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - model = BertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class BertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = BertModel.from_pretrained("google-bert/bert-base-uncased") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = (1, 11, 768) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor([[[0.4249, 0.1008, 0.7531], [0.3771, 0.1188, 0.7467], [0.4152, 0.1098, 0.7108]]]) - - self.assertTrue(ops.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-3)) - - @slow - def test_inference_no_head_relative_embedding_key(self): - model = BertModel.from_pretrained("zhiheng-huang/bert-base-uncased-embedding-relative-key") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = (1, 11, 768) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[0.0756, 0.3142, -0.5128], [0.3761, 0.3462, -0.5477], [0.2052, 0.3760, -0.1240]]] - ) - - self.assertTrue(ops.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-3)) - - @slow - def test_inference_no_head_relative_embedding_key_query(self): - model = BertModel.from_pretrained("zhiheng-huang/bert-base-uncased-embedding-relative-key-query") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = (1, 11, 768) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[0.6496, 0.3784, 0.8203], [0.8148, 0.5656, 0.2636], [-0.0681, 0.5597, 0.7045]]] - ) - - print(output[:, 1:4, 1:4]) - self.assertTrue(ops.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-3)) - - def test_sdpa_ignored_mask(self): - pkv = [] - - model = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="eager") - model_sdpa = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="sdpa") - - model = model.eval() - model_sdpa = model_sdpa.eval() - - for _ in range(model.config.num_hidden_layers): - num_heads = model.config.num_attention_heads - head_dim = model.config.hidden_size // model.config.num_attention_heads - pkv.append([ops.rand(1, num_heads, 3, head_dim), ops.rand(1, num_heads, 3, head_dim)]) - - tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel") - inp = tokenizer("I am in Paris and", return_tensors="pt") - - del inp["attention_mask"] - - with no_grad(): - res_eager = model(**inp) - res_sdpa = model_sdpa(**inp) - self.assertTrue( - ops.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, 
atol=1e-3, rtol=1e-3) - ) - - # Case where query length != kv_length. - res_eager = model(**inp, past_key_values=pkv) - res_sdpa = model_sdpa(**inp, past_key_values=pkv) - self.assertTrue( - ops.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-3, rtol=1e-3) - ) - - @slow - def test_inference_time(self): - import time - model = BertModel.from_pretrained("google-bert/bert-base-uncased") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - infer_time = [] - with no_grad(): - for i in range(20): - s = time.time() - output = model(input_ids, attention_mask=attention_mask)[0] - t = time.time() - infer_time.append(t - s) - print(infer_time) diff --git a/tests/transformers/models/bert/test_tokenization_bert.py b/tests/transformers/models/bert/test_tokenization_bert.py deleted file mode 100644 index 5c78e0fbc..000000000 --- a/tests/transformers/models/bert/test_tokenization_bert.py +++ /dev/null @@ -1,343 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import unittest - -from mindspore.dataset import GeneratorDataset - -from mindnlp.utils.testing_utils import slow -from mindnlp.transformers import BertTokenizerFast -from mindnlp.transformers.models.bert.tokenization_bert import ( - VOCAB_FILES_NAMES, - BasicTokenizer, - BertTokenizer, - WordpieceTokenizer, - _is_control, - _is_punctuation, - _is_whitespace, -) - -from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english - - -class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertTokenizer - rust_tokenizer_class = BertTokenizerFast - test_rust_tokenizer = True - space_between_special_tokens = True - from_pretrained_filter = filter_non_english - - def setUp(self): - super().setUp() - - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" - output_text = "unwanted, running" - return input_text, output_text - - def test_full_tokenizer(self): - tokenizer = self.tokenizer_class(self.vocab_file) - - tokens = tokenizer.tokenize("UNwant\u00E9d,running") - self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) - - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - sequence = "UNwant\u00E9d,running" - - tokens = 
tokenizer.tokenize(sequence) - rust_tokens = rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - rust_tokenizer = self.get_rust_tokenizer() - ids = tokenizer.encode(sequence) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - # With lower casing - tokenizer = self.get_tokenizer(do_lower_case=True) - rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - - sequence = "UNwant\u00E9d,running" - - tokens = tokenizer.tokenize(sequence) - rust_tokens = rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - rust_tokenizer = self.get_rust_tokenizer() - ids = tokenizer.encode(sequence) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - def test_chinese(self): - tokenizer = BasicTokenizer() - - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) - - def test_basic_tokenizer_lower(self): - tokenizer = BasicTokenizer(do_lower_case=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_lower_strip_accents_false(self): - tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) - - def test_basic_tokenizer_lower_strip_accents_true(self): - tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_lower_strip_accents_default(self): - tokenizer = BasicTokenizer(do_lower_case=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_no_lower(self): - tokenizer = BasicTokenizer(do_lower_case=False) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] - ) - - def test_basic_tokenizer_no_lower_strip_accents_false(self): - tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] - ) - - def test_basic_tokenizer_no_lower_strip_accents_true(self): - tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] - ) - - def test_basic_tokenizer_respects_never_split_tokens(self): - tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] - ) - - def test_basic_tokenizer_splits_on_punctuation(self): - tokenizer = BasicTokenizer() - text = "a\n'll !!to?'d of, can't." - expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."] - self.assertListEqual(tokenizer.tokenize(text), expected) - - def test_wordpiece_tokenizer(self): - vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] - - vocab = {} - for i, token in enumerate(vocab_tokens): - vocab[token] = i - tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") - - self.assertListEqual(tokenizer.tokenize(""), []) - - self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) - - self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) - - def test_is_whitespace(self): - self.assertTrue(_is_whitespace(" ")) - self.assertTrue(_is_whitespace("\t")) - self.assertTrue(_is_whitespace("\r")) - self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) - - self.assertFalse(_is_whitespace("A")) - self.assertFalse(_is_whitespace("-")) - - def test_is_control(self): - self.assertTrue(_is_control("\u0005")) - - self.assertFalse(_is_control("A")) - self.assertFalse(_is_control(" ")) - self.assertFalse(_is_control("\t")) - self.assertFalse(_is_control("\r")) - - def test_is_punctuation(self): - self.assertTrue(_is_punctuation("-")) - self.assertTrue(_is_punctuation("$")) - self.assertTrue(_is_punctuation("`")) - self.assertTrue(_is_punctuation(".")) - - self.assertFalse(_is_punctuation("A")) - self.assertFalse(_is_punctuation(" ")) - - def test_clean_text(self): - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 - self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]) - - self.assertListEqual( - [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]] - ) - - @slow - def test_sequence_builders(self): - tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") - - text = tokenizer.encode("sequence builders", add_special_tokens=False) - text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) - - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - assert encoded_sentence == [101] + text + [102] - assert encoded_pair == [101] + text + [102] + text_2 + [102] - - def test_offsets_with_special_characters(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
- tokens = tokenizer_r.encode_plus( - sentence, - return_attention_mask=False, - return_token_type_ids=False, - return_offsets_mapping=True, - add_special_tokens=True, - ) - - do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False - expected_results = ( - [ - ((0, 0), tokenizer_r.cls_token), - ((0, 1), "A"), - ((1, 2), ","), - ((3, 5), "na"), - ((5, 6), "##ï"), - ((6, 8), "##ve"), - ((9, 15), tokenizer_r.mask_token), - ((16, 21), "Allen"), - ((21, 23), "##NL"), - ((23, 24), "##P"), - ((25, 33), "sentence"), - ((33, 34), "."), - ((0, 0), tokenizer_r.sep_token), - ] - if not do_lower_case - else [ - ((0, 0), tokenizer_r.cls_token), - ((0, 1), "a"), - ((1, 2), ","), - ((3, 8), "naive"), - ((9, 15), tokenizer_r.mask_token), - ((16, 21), "allen"), - ((21, 23), "##nl"), - ((23, 24), "##p"), - ((25, 33), "sentence"), - ((33, 34), "."), - ((0, 0), tokenizer_r.sep_token), - ] - ) - - self.assertEqual( - [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) - ) - self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) - - def test_change_tokenize_chinese_chars(self): - list_of_commun_chinese_char = ["的", "人", "有"] - text_with_chinese_char = "".join(list_of_commun_chinese_char) - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) - ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) - - tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r) - tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p) - - # it is expected that each Chinese character is not preceded by "##" - self.assertListEqual(tokens_without_spe_char_p, list_of_commun_chinese_char) - self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) - - kwargs["tokenize_chinese_chars"] = False - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) - ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) - - tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r) - tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p) - - # it is expected that only the first Chinese character is not preceded by "##". 
- expected_tokens = [ - f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_commun_chinese_char) - ] - self.assertListEqual(tokens_without_spe_char_p, expected_tokens) - self.assertListEqual(tokens_without_spe_char_r, expected_tokens) diff --git a/tests/transformers/models/bert_generation/__init__.py b/tests/transformers/models/bert_generation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bert_generation/test_modeling_bert_generation.py b/tests/transformers/models/bert_generation/test_modeling_bert_generation.py deleted file mode 100644 index 65f0976f6..000000000 --- a/tests/transformers/models/bert_generation/test_modeling_bert_generation.py +++ /dev/null @@ -1,335 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest -import numpy as np -from mindnlp.transformers import BertGenerationConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import BertGenerationDecoder, BertGenerationEncoder - - -class BertGenerationEncoderTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=50, - initializer_range=0.02, - use_labels=True, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.use_labels = use_labels - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if self.use_labels: - token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - - return config, input_ids, input_mask, token_labels - - 
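# Illustrative sketch, not part of the patch: every tester class removed by this change
# (BeitModelTester, BertModelTester, BertGenerationEncoderTester, BigBirdModelTester, ...)
# follows the same recipe -- build random integer inputs, construct a deliberately tiny
# config, run the model in eval mode, and assert only on output shapes. The condensed,
# self-contained version below is hypothetical (class and test names are invented) and
# assumes the mindnlp APIs behave as they do in the removed files.
import unittest

import numpy as np
import mindspore

from mindnlp.transformers import BertConfig, BertModel


class TinyConfigShapeTest(unittest.TestCase):
    """Hypothetical, stand-alone version of the shape checks in the deleted testers."""

    def test_last_hidden_state_shape(self):
        batch_size, seq_length = 2, 7
        config = BertConfig(
            vocab_size=99,
            hidden_size=32,
            num_hidden_layers=2,
            num_attention_heads=4,
            intermediate_size=37,
        )

        # Random token ids and an all-ones mask, mirroring ids_tensor / random_attention_mask.
        input_ids = mindspore.tensor(
            np.random.randint(0, config.vocab_size, (batch_size, seq_length)),
            dtype=mindspore.int64,
        )
        attention_mask = mindspore.tensor(np.ones((batch_size, seq_length)), dtype=mindspore.int64)

        model = BertModel(config)
        model.set_train(False)  # eval mode, as in the removed create_and_check_* helpers

        result = model(input_ids, attention_mask=attention_mask)
        self.assertEqual(
            result.last_hidden_state.shape, (batch_size, seq_length, config.hidden_size)
        )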
def get_config(self): - return BertGenerationConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - token_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - input_mask, - token_labels, - **kwargs, - ): - model = BertGenerationEncoder(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - input_mask, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - **kwargs, - ): - config.add_cross_attention = True - model = BertGenerationEncoder(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - input_mask, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - **kwargs, - ): - config.is_decoder = True - config.add_cross_attention = True - model = BertGenerationDecoder(config=config).set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), 
output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - input_mask, - token_labels, - *args, - ): - model = BertGenerationDecoder(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def prepare_config_and_inputs_for_common(self): - config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BertGenerationEncoder, BertGenerationDecoder) if is_mindspore_available() else () - all_generative_model_classes = (BertGenerationDecoder,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": BertGenerationEncoder, "text-generation": BertGenerationDecoder} - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = BertGenerationEncoderTester(self) - self.config_tester = ConfigTester(self, config_class=BertGenerationConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_bert(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() - config.model_type = "bert" - self.model_tester.create_and_check_model(config, input_ids, input_mask, token_labels) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - input_mask, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - input_mask, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model = BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") - self.assertIsNotNone(model) - - -@require_mindspore -class BertGenerationEncoderIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = 
BertGenerationEncoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") - input_ids = mindspore.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]) - output = model(input_ids)[0] - expected_shape = (1, 8, 1024) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]] - ) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - -@require_mindspore -class BertGenerationDecoderIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") - input_ids = mindspore.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]) - output = model(input_ids)[0] - expected_shape = (1, 8, 50358) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[-0.5788, -2.5994, -3.7054], [0.0438, 4.7997, 1.8795], [1.5862, 6.6409, 4.4638]]] - ) - print(output[:, :3, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/big_bird/__init__.py b/tests/transformers/models/big_bird/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/big_bird/test_modeling_big_bird.py b/tests/transformers/models/big_bird/test_modeling_big_bird.py deleted file mode 100644 index 958e4289d..000000000 --- a/tests/transformers/models/big_bird/test_modeling_big_bird.py +++ /dev/null @@ -1,943 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
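The BertGeneration integration tests above follow a golden-slice pattern: run the pretrained model once, assert the full output shape, then compare a small fixed slice of the output against hard-coded reference values within an absolute tolerance. A minimal sketch of that check, assuming only numpy and placeholder values (the helper name is illustrative, not from the removed file):

import numpy as np

def check_golden_slice(output, expected_shape, expected_slice, atol=1e-4):
    # Check the overall shape first, then a small reference slice of the values.
    assert output.shape == expected_shape, f"unexpected shape {output.shape}"
    assert np.allclose(output[:, :3, :3], expected_slice, atol=atol), "slice mismatch"

# Placeholder stand-in for a real forward pass over 8 tokens with hidden size 1024.
output = np.zeros((1, 8, 1024), dtype=np.float32)
output[0, :3, :3] = [[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]
expected_slice = np.array([[[0.1775, 0.0083, -0.0321], [1.6002, 0.1287, 0.3912], [2.1473, 0.5791, 0.6066]]])
check_golden_slice(output, (1, 8, 1024), expected_slice)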
-"""Testing suite for the MindSpore BigBird model.""" - -import unittest - -from mindnlp.transformers import BigBirdConfig, is_mindspore_available -from mindnlp.transformers.models.auto import get_values -from mindnlp.transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - BigBirdForCausalLM, - BigBirdForMaskedLM, - BigBirdForMultipleChoice, - BigBirdForPreTraining, - BigBirdForQuestionAnswering, - BigBirdForSequenceClassification, - BigBirdForTokenClassification, - BigBirdModel, - ) - - -class BigBirdModelTester: - def __init__( - self, - parent, - batch_size=7, - seq_length=128, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu_new", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=256, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - attention_type="block_sparse", - use_bias=True, - rescale_embeddings=False, - block_size=8, - num_rand_blocks=3, - position_embedding_type="absolute", - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - self.attention_type = attention_type - self.use_bias = use_bias - self.rescale_embeddings = rescale_embeddings - self.block_size = block_size - self.num_rand_blocks = num_rand_blocks - self.position_embedding_type = position_embedding_type - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, 
input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return BigBirdConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_encoder_decoder=False, - initializer_range=self.initializer_range, - attention_type=self.attention_type, - use_bias=self.use_bias, - rescale_embeddings=self.rescale_embeddings, - block_size=self.block_size, - num_random_blocks=self.num_rand_blocks, - position_embedding_type=self.position_embedding_type, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BigBirdModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_pretraining( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BigBirdForPreTraining(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - next_sentence_label=sequence_labels, - ) - self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, config.num_labels)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = BigBirdModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model 
= BigBirdForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BigBirdForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = BigBirdForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BigBirdForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = BigBirdForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = BigBirdForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = BigBirdForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - def create_and_check_for_auto_padding( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = BigBirdModel(config) - model.eval() - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_change_to_full_attn( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = BigBirdModel(config) - model.eval() - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - # the config should not be changed - self.parent.assertTrue(model.config.attention_type == "block_sparse") - - -@require_mindspore -class BigBirdModelTest(ModelTesterMixin, unittest.TestCase): - # head masking & pruning is currently not supported for big bird - test_head_masking = False - test_pruning = False - - # torchscript should be possible, but takes prohibitively long to test. - # Also torchscript is not an important feature to have in the beginning. 
- test_torchscript = False - - all_model_classes = ( - ( - BigBirdModel, - BigBirdForPreTraining, - BigBirdForMaskedLM, - BigBirdForCausalLM, - BigBirdForMultipleChoice, - BigBirdForQuestionAnswering, - BigBirdForSequenceClassification, - BigBirdForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (BigBirdForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BigBirdModel, - "fill-mask": BigBirdForMaskedLM, - "question-answering": BigBirdForQuestionAnswering, - "text-classification": BigBirdForSequenceClassification, - "text-generation": BigBirdForCausalLM, - "token-classification": BigBirdForTokenClassification, - "zero-shot": BigBirdForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - inputs_dict["next_sentence_label"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = BigBirdModelTester(self) - self.config_tester = ConfigTester(self, config_class=BigBirdConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, 
- encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_retain_grad_hidden_states_attentions(self): - # bigbird cannot keep gradients in attentions when `attention_type=block_sparse` - - if self.model_tester.attention_type == "original_full": - super().test_retain_grad_hidden_states_attentions() - - @slow - def test_model_from_pretrained(self): - model_name = "google/bigbird-roberta-base" - model = BigBirdForPreTraining.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_model_various_attn_type(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["original_full", "block_sparse"]: - config_and_inputs[0].attention_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip - def test_fast_integration(self): - # fmt: off - input_ids = mindspore.tensor( - [[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 122, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 44, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 98, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73],[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 12, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 28, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 18, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73]], # noqa: E231 - dtype=mindspore.int64, - ) - # fmt: on - input_ids = input_ids % self.model_tester.vocab_size - input_ids[1] = input_ids[1] - 1 - - attention_mask = ops.ones((input_ids.shape)) - attention_mask[:, :-10] = 0 - - config, _, _, _, _, _, _ = self.model_tester.prepare_config_and_inputs() - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = BigBirdModel(config).eval() - - with no_grad(): - hidden_states = model(input_ids, attention_mask=attention_mask).last_hidden_state - self.assertTrue( - ops.allclose( - hidden_states[0, 0, :5], - mindspore.tensor([1.4825, 0.0774, 0.8226, -0.2962, -0.9593]), - atol=1e-3, - ) - ) - - def test_auto_padding(self): - self.model_tester.seq_length = 241 - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_auto_padding(*config_and_inputs) - - def test_for_change_to_full_attn(self): - self.model_tester.seq_length = 9 - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_change_to_full_attn(*config_and_inputs) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure 
seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # overwrite from common in order to skip the check on `attentions` - def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, name="outputs", attributes=None): - # `bigbird_block_sparse_attention` in `FlaxBigBird` returns `attention_probs = None`, while in PyTorch version, - # an effort was done to return `attention_probs` (yet to be verified). - if name.startswith("outputs.attentions"): - return - else: - super().check_pt_flax_outputs(fx_outputs, pt_outputs, model_class, tol, name, attributes) - - -@require_mindspore -@slow -class BigBirdModelIntegrationTest(unittest.TestCase): - # we can have this true once block_sparse attn_probs works accurately - test_attention_probs = False - - def _get_dummy_input_ids(self): - # fmt: off - ids = mindspore.tensor( - [[6, 117, 33, 36, 70, 22, 63, 31, 71, 72, 88, 58, 109, 49, 48, 116, 92, 6, 19, 95, 118, 100, 80, 111, 93, 2, 31, 84, 26, 5, 6, 82, 46, 96, 109, 4, 39, 19, 109, 13, 92, 31, 36, 90, 111, 18, 75, 6, 56, 74, 16, 42, 56, 92, 69, 108, 127, 81, 82, 41, 106, 19, 44, 24, 82, 121, 120, 65, 36, 26, 72, 13, 36, 98, 43, 64, 8, 53, 100, 92, 51, 122, 66, 17, 61, 50, 104, 127, 26, 35, 94, 23, 110, 71, 80, 67, 109, 111, 44, 19, 51, 41, 86, 71, 76, 44, 18, 68, 44, 77, 107, 81, 98, 126, 100, 2, 49, 98, 84, 39, 23, 98, 52, 46, 10, 82, 121, 73]], # noqa: E231 - dtype=mindspore.int64, - ) - # fmt: on - return ids - - def test_inference_block_sparse_pretraining(self): - model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="block_sparse") - - input_ids = mindspore.tensor([[20920, 232, 328, 1437] * 1024], dtype=mindspore.int64) - with no_grad(): - outputs = model(input_ids) - prediction_logits = outputs.prediction_logits - seq_relationship_logits = outputs.seq_relationship_logits - - self.assertEqual(prediction_logits.shape, tuple((1, 4096, 50358))) - self.assertEqual(seq_relationship_logits.shape, tuple((1, 2))) - - expected_prediction_logits_slice = mindspore.tensor( - [ - [-0.5583, 0.0475, -0.2508, 7.4423], - [0.7409, 1.4460, -0.7593, 7.7010], - [1.9150, 3.1395, 5.8840, 9.3498], - [-0.1854, -1.4640, -2.2052, 3.7968], - ], - ) - - self.assertTrue( - ops.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4) - ) - - expected_seq_relationship_logits = mindspore.tensor([[46.9465, 47.9517]]) - self.assertTrue(ops.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4)) - - def test_inference_full_pretraining(self): - model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base", attention_type="original_full") - - input_ids = mindspore.tensor([[20920, 232, 328, 1437] * 512], dtype=mindspore.int64) - with no_grad(): - outputs = model(input_ids) - prediction_logits = outputs.prediction_logits - seq_relationship_logits = outputs.seq_relationship_logits - - self.assertEqual(prediction_logits.shape, tuple((1, 512 * 4, 50358))) - self.assertEqual(seq_relationship_logits.shape, tuple((1, 2))) - - expected_prediction_logits_slice = mindspore.tensor( - [ - [0.1499, -1.1217, 0.1990, 8.4499], - [-2.7757, 
-3.0687, -4.8577, 7.5156], - [1.5446, 0.1982, 4.3016, 10.4281], - [-1.3705, -4.0130, -3.9629, 5.1526], - ], - ) - self.assertTrue( - ops.allclose(prediction_logits[0, 128:132, 128:132], expected_prediction_logits_slice, atol=1e-4) - ) - - expected_seq_relationship_logits = mindspore.tensor([[41.4503, 41.2406]]) - self.assertTrue(ops.allclose(seq_relationship_logits, expected_seq_relationship_logits, atol=1e-4)) - - def test_block_sparse_attention_probs(self): - """ - Asserting if outputted attention matrix is similar to hard coded attention matrix - """ - - if not self.test_attention_probs: - self.skipTest("test_attention_probs is set to False") - - model = BigBirdModel.from_pretrained( - "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 - ) - model.eval() - config = model.config - - input_ids = self._get_dummy_input_ids() - - hidden_states = model.embeddings(input_ids) - - batch_size, seqlen, _ = hidden_states.size() - attn_mask = ops.ones(batch_size, seqlen, dtype=mindspore.float32) - to_seq_length = from_seq_length = seqlen - from_block_size = to_block_size = config.block_size - - blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn( - attn_mask, config.block_size - ) - from_blocked_mask = to_blocked_mask = blocked_mask - - for i in range(config.num_hidden_layers): - pointer = model.encoder.layer[i].attention.self - - query_layer = pointer.transpose_for_scores(pointer.query(hidden_states)) - key_layer = pointer.transpose_for_scores(pointer.key(hidden_states)) - value_layer = pointer.transpose_for_scores(pointer.value(hidden_states)) - - context_layer, attention_probs = pointer.bigbird_block_sparse_attention( - query_layer, - key_layer, - value_layer, - band_mask, - from_mask, - to_mask, - from_blocked_mask, - to_blocked_mask, - pointer.num_attention_heads, - pointer.num_random_blocks, - pointer.attention_head_size, - from_block_size, - to_block_size, - batch_size, - from_seq_length, - to_seq_length, - seed=pointer.seed, - plan_from_length=None, - plan_num_rand_blocks=None, - output_attentions=True, - ) - - context_layer = context_layer.view(batch_size, from_seq_length, -1) - cl = ops.einsum("bhqk,bhkd->bhqd", attention_probs, value_layer) - cl = cl.view(context_layer.size()) - - self.assertTrue(ops.allclose(context_layer, cl, atol=0.001)) - - def test_block_sparse_context_layer(self): - model = BigBirdModel.from_pretrained( - "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 - ) - model.eval() - config = model.config - - input_ids = self._get_dummy_input_ids() - dummy_hidden_states = model.embeddings(input_ids) - - attn_mask = ops.ones_like(input_ids) - blocked_mask, band_mask, from_mask, to_mask = model.create_masks_for_block_sparse_attn( - attn_mask, config.block_size - ) - - targeted_cl = mindspore.tensor( - [ - [0.1870, 1.5248, 0.2333, -0.0483, -0.0952, 1.8359, -0.0142, 0.1239, 0.0083, -0.0045], - [-0.0601, 0.1243, 0.1329, -0.1524, 0.2347, 0.0894, -0.2248, -0.2461, -0.0645, -0.0109], - [-0.0418, 0.1463, 0.1290, -0.1638, 0.2489, 0.0799, -0.2341, -0.2406, -0.0524, 0.0106], - [0.1859, 1.5182, 0.2324, -0.0473, -0.0952, 1.8295, -0.0148, 0.1242, 0.0080, -0.0045], - [0.1879, 1.5300, 0.2334, -0.0480, -0.0967, 1.8428, -0.0137, 0.1256, 0.0087, -0.0050], - [0.1852, 1.5149, 0.2330, -0.0492, -0.0936, 1.8236, -0.0154, 0.1210, 0.0080, -0.0048], - [0.1857, 1.5186, 0.2331, -0.0484, -0.0940, 1.8285, -0.0148, 0.1224, 0.0077, -0.0045], - [0.1884, 1.5336, 0.2334, -0.0469, 
-0.0974, 1.8477, -0.0132, 0.1266, 0.0085, -0.0046], - [0.1881, 1.5308, 0.2334, -0.0479, -0.0969, 1.8438, -0.0136, 0.1258, 0.0088, -0.0050], - [0.1849, 1.5143, 0.2329, -0.0491, -0.0930, 1.8230, -0.0156, 0.1209, 0.0074, -0.0047], - [0.1878, 1.5299, 0.2333, -0.0472, -0.0967, 1.8434, -0.0137, 0.1257, 0.0084, -0.0048], - [0.1873, 1.5260, 0.2333, -0.0478, -0.0961, 1.8383, -0.0142, 0.1245, 0.0083, -0.0048], - [0.1849, 1.5145, 0.2327, -0.0491, -0.0935, 1.8237, -0.0156, 0.1215, 0.0083, -0.0046], - [0.1866, 1.5232, 0.2332, -0.0488, -0.0950, 1.8342, -0.0143, 0.1237, 0.0084, -0.0047], - ], - ) - - context_layer = model.encoder.layer[0].attention.self( - dummy_hidden_states, - band_mask=band_mask, - from_mask=from_mask, - to_mask=to_mask, - from_blocked_mask=blocked_mask, - to_blocked_mask=blocked_mask, - ) - context_layer = context_layer[0] - - self.assertEqual(context_layer.shape, tuple((1, 128, 768))) - self.assertTrue(ops.allclose(context_layer[0, 64:78, 300:310], targeted_cl, atol=0.0001)) - - def test_tokenizer_inference(self): - tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") - model = BigBirdModel.from_pretrained( - "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 - ) - - text = [ - "Transformer-based models are unable to process long sequences due to their self-attention operation," - " which scales quadratically with the sequence length. To address this limitation, we introduce the" - " Longformer with an attention mechanism that scales linearly with sequence length, making it easy to" - " process documents of thousands of tokens or longer. Longformer’s attention mechanism is a drop-in" - " replacement for the standard self-attention and combines a local windowed attention with a task" - " motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer" - " on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In" - " contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream" - " tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new" - " state-of-the-art results on WikiHop and TriviaQA." 
- ] - inputs = tokenizer(text) - - for k in inputs: - inputs[k] = mindspore.tensor(inputs[k], dtype=mindspore.int64) - - prediction = model(**inputs) - prediction = prediction[0] - - self.assertEqual(prediction.shape, tuple((1, 199, 768))) - - expected_prediction = mindspore.tensor( - [ - [0.1887, -0.0474, 0.2604, 0.1453], - [0.0651, 0.1999, 0.1797, 0.1161], - [0.2833, -0.3036, 0.6910, 0.1123], - [0.2836, -0.4644, -0.0111, 0.1530], - [0.3919, -0.2823, 0.4192, 0.1687], - [0.2168, -0.1956, 0.4050, 0.0925], - [0.2597, -0.0884, 0.1258, 0.1119], - [0.1127, -0.1203, 0.1924, 0.2859], - [0.1362, -0.1315, 0.2693, 0.1027], - [-0.3169, -0.2266, 0.4419, 0.6740], - [0.2366, -0.1452, 0.2589, 0.0579], - [0.0358, -0.2021, 0.3112, -0.1392], - ], - ) - - self.assertTrue(ops.allclose(prediction[0, 52:64, 320:324], expected_prediction, atol=1e-4)) - - def test_inference_question_answering(self): - tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-base-trivia-itc") - model = BigBirdForQuestionAnswering.from_pretrained( - "google/bigbird-base-trivia-itc", attention_type="block_sparse", block_size=16, num_random_blocks=3 - ) - - context = ( - "The BigBird model was proposed in Big Bird: Transformers for Longer Sequences by Zaheer, Manzil and" - " Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago" - " and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a" - " sparse-attention based transformer which extends Transformer based models, such as BERT to much longer" - " sequences. In addition to sparse attention, BigBird also applies global attention as well as random" - " attention to the input sequence. Theoretically, it has been shown that applying sparse, global, and" - " random attention approximates full attention, while being computationally much more efficient for longer" - " sequences. As a consequence of the capability to handle longer context, BigBird has shown improved" - " performance on various long document NLP tasks, such as question answering and summarization, compared" - " to BERT or RoBERTa." 
- ) - - question = [ - "Which is better for longer sequences- BigBird or BERT?", - "What is the benefit of using BigBird over BERT?", - ] - inputs = tokenizer( - question, - [context, context], - padding=True, - return_tensors="pt", - add_special_tokens=True, - max_length=256, - truncation=True, - ) - - inputs = {k: v for k, v in inputs.items()} - - start_logits, end_logits = model(**inputs).to_tuple() - - # fmt: off - target_start_logits = mindspore.tensor( - [[-8.5622, -9.6209, -14.3351, -8.7032, -11.8596, -7.7446, -9.6730, -13.6063, -8.9651, -11.7417, -8.2641, -8.7056, -13.4116, -5.6600, -8.8316, -10.4148, -12.2180, -7.7979, -12.5274, -6.0685, -10.3373, -11.3128, -6.6456, -14.4030, -6.8292, -14.5383, -11.5638, -6.3326, 11.5293, -1.8434, -10.0013, -7.6150], [-10.7384, -13.1179, -10.1837, -13.7700, -10.0186, -11.7335, -13.3411, -10.0188, -13.4235, -9.9381, -10.4252, -13.1281, -8.2022, -10.4326, -11.5542, -14.1549, -10.7546, -13.4691, -8.2744, -11.4324, -13.3773, -9.8284, -14.5825, -8.7471, -14.7050, -8.0364, -11.3627, -6.4638, -11.7031, -14.3446, -9.9425, -8.0088]], # noqa: E231 - ) - - target_end_logits = mindspore.tensor( - [[-12.1736, -8.8487, -14.8877, -11.6713, -15.1165, -12.2396, -7.6828, -15.4153, -12.2528, -14.3671, -12.3596, -7.4272, -14.9615, -13.6356, -11.7939, -9.9767, -14.8112, -8.9567, -15.8798, -11.5291, -9.4249, -14.7544, -7.9387, -16.2789, -8.9702, -15.3111, -11.5585, -7.9992, -4.1127, 10.3209, -8.3926, -10.2005], [-11.1375, -15.4027, -12.6861, -16.9884, -13.7093, -10.3560, -15.7228, -12.9290, -15.8519, -13.7953, -10.2460, -15.7198, -14.2078, -12.8477, -11.4861, -16.1017, -11.8900, -16.4488, -13.2959, -10.3980, -15.4874, -10.3539, -16.8263, -10.9973, -17.0344, -9.2751, -10.1196, -13.8907, -12.1025, -13.0628, -12.8530, -13.8173]], # noqa: E321 - ) - # fmt: on - - self.assertTrue(ops.allclose(start_logits[:, 64:96], target_start_logits, atol=1e-4)) - self.assertTrue(ops.allclose(end_logits[:, 64:96], target_end_logits, atol=1e-4)) - - input_ids = inputs["input_ids"].tolist() - answer = [ - input_ids[i][ops.argmax(start_logits, dim=-1)[i] : ops.argmax(end_logits, dim=-1)[i] + 1] - for i in range(len(input_ids)) - ] - answer = tokenizer.batch_decode(answer) - - self.assertTrue(answer == ["BigBird", "global attention"]) - - def test_fill_mask(self): - tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base") - model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base") - - input_ids = tokenizer("The goal of life is [MASK] .", return_tensors="pt").input_ids - logits = model(input_ids).logits - - # [MASK] is token at 6th position - pred_token = tokenizer.decode(ops.argmax(logits[0, 6:7], axis=-1)) - self.assertEqual(pred_token, "happiness") - - def test_auto_padding(self): - model = BigBirdModel.from_pretrained( - "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 - ) - model.eval() - - input_ids = mindspore.tensor([200 * [10] + 40 * [2] + [1]], dtype=mindspore.int64) - with no_grad(): - output = model(input_ids).to_tuple()[0] - - # fmt: off - target = mindspore.tensor( - [[-0.129420, -0.164740, 0.042422, -0.336030, 0.094379, 0.033794, 0.384590, 0.229660, -0.196500, 0.108020], [-0.000154, -0.168800, 0.165820, -0.313670, 0.101240, 0.035145, 0.381880, 0.213730, -0.201080, 0.077443], [0.053754, -0.166350, 0.225520, -0.272900, 0.119670, 0.019987, 0.348670, 0.199190, -0.181600, 0.084640], [0.063636, -0.187110, 0.237010, -0.297380, 0.126300, 0.020025, 0.268490, 0.191820, -0.192300, 0.035077], [0.073893, 
-0.184790, 0.188870, -0.297860, 0.134280, 0.028972, 0.174650, 0.186890, -0.180530, 0.006851], [0.005253, -0.169360, 0.123100, -0.302550, 0.126930, 0.024188, 0.133410, 0.200600, -0.168210, -0.001006], [-0.093336, -0.175370, -0.004768, -0.333170, 0.114330, 0.034168, 0.120960, 0.203570, -0.162810, -0.005757], [-0.160210, -0.169310, -0.049064, -0.331950, 0.115730, 0.027062, 0.143600, 0.205310, -0.144580, 0.026746], [-0.193200, -0.156820, -0.079422, -0.351600, 0.106450, 0.032174, 0.245690, 0.210250, -0.173480, 0.043914], [-0.167980, -0.153050, -0.059764, -0.357890,0.103910, 0.031481, 0.334190, 0.208960,-0.178180, 0.072165], [-0.136990, -0.156950, -0.012099, -0.353140,0.096996, 0.025864, 0.376340, 0.216050, -0.171820, 0.089963], [-0.041143, -0.167060, 0.079754, -0.353220, 0.093247, 0.019867, 0.385810, 0.214340, -0.191800, 0.065946],[0.040373, -0.158610, 0.152570, -0.312930, 0.110590, 0.012282, 0.345270, 0.204040, -0.176500, 0.064972], [0.043762, -0.166450, 0.179500, -0.317930, 0.117280, -0.004040, 0.304490, 0.201380, -0.182780, 0.044000]], # noqa: E231 - ) - # fmt: on - - self.assertEqual(output.shape, tuple((1, 241, 768))) - self.assertTrue(ops.allclose(output[0, 64:78, 300:310], target, atol=0.0001)) diff --git a/tests/transformers/models/bigbird_pegasus/__init__.py b/tests/transformers/models/bigbird_pegasus/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/transformers/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py deleted file mode 100644 index 4c1f4bfed..000000000 --- a/tests/transformers/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ /dev/null @@ -1,816 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
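The BigBird test_auto_padding case above (241 input tokens with block_size=16) exercises the fact that block-sparse attention needs the sequence length to be a multiple of the block size: the model pads internally, masks out the padded positions, and still returns hidden states for the original 241 positions. A minimal sketch of the length arithmetic, assuming padding always goes up to the next multiple of the block size (the helper name is illustrative):

def pad_to_multiple_of_block_size(seq_len: int, block_size: int) -> int:
    # Extra positions needed so that seq_len becomes a multiple of block_size.
    return (block_size - seq_len % block_size) % block_size

# 241 tokens with block_size=16 would be padded by 15 positions (to 256) internally,
# while the asserted output shape keeps the original length of 241.
assert pad_to_multiple_of_block_size(241, 16) == 15
assert pad_to_multiple_of_block_size(256, 16) == 0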
-"""Testing suite for the MindSpore BigBirdPegasus model.""" - -import copy -import tempfile -import unittest - -from mindnlp.transformers import BigBirdPegasusConfig, is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - BigBirdPegasusForCausalLM, - BigBirdPegasusForConditionalGeneration, - BigBirdPegasusForQuestionAnswering, - BigBirdPegasusForSequenceClassification, - BigBirdPegasusModel, - PegasusTokenizer, - ) - from mindnlp.transformers.models.bigbird_pegasus.modeling_bigbird_pegasus import ( - BigBirdPegasusDecoder, - BigBirdPegasusEncoder, - ) - -MODEL_ID = "google/bigbird-pegasus-large-pubmed" - -def prepare_bigbird_pegasus_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - - input_dict = { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - } - input_dict = {k: input_dict[k] for k in input_dict} - return input_dict - - -class BigBirdPegasusModelTester: - def __init__( - self, - parent, - batch_size=7, - seq_length=256, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=31, - hidden_act="gelu_fast", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=260, - eos_token_id=1, - pad_token_id=0, - bos_token_id=2, - attention_type="block_sparse", - use_bias=False, - block_size=16, - num_random_blocks=3, - scale_embedding=True, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - self.attention_type = attention_type - self.use_bias = use_bias - self.block_size = block_size - self.num_random_blocks = num_random_blocks - self.scale_embedding = scale_embedding - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_bigbird_pegasus_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - 
def get_config(self): - return BigBirdPegasusConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - attention_type=self.attention_type, - use_bias=self.use_bias, - block_size=self.block_size, - num_random_blocks=self.num_random_blocks, - scale_embedding=self.scale_embedding, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BigBirdPegasusModel(config=config).get_decoder().eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.to(attention_mask.dtype)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = BigBirdPegasusModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = BigBirdPegasusEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = BigBirdPegasusDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - 
encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - def create_and_check_model(self, config, inputs_dict): - model = BigBirdPegasusModel(config=config).eval() - input_ids = inputs_dict["input_ids"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - result = model(input_ids, decoder_input_ids=decoder_input_ids, use_cache=True) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - -@require_mindspore -class BigBirdPegasusModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - BigBirdPegasusModel, - BigBirdPegasusForConditionalGeneration, - BigBirdPegasusForSequenceClassification, - BigBirdPegasusForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (BigBirdPegasusForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BigBirdPegasusModel, - "question-answering": BigBirdPegasusForQuestionAnswering, - "summarization": BigBirdPegasusForConditionalGeneration, - "text-classification": BigBirdPegasusForSequenceClassification, - "text-generation": BigBirdPegasusForCausalLM, - "text2text-generation": BigBirdPegasusForConditionalGeneration, - "translation": BigBirdPegasusForConditionalGeneration, - "zero-shot": BigBirdPegasusForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_missing_keys = False - test_pruning = False - test_head_masking = False - - # torchscript tests are not passing for now. - # Also torchscript is not an important feature to have in the beginning. - test_torchscript = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - # overwrite from GenerationTesterMixin to solve problem - # with conflicting random seeds - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.attention_type = "original_full" - - input_ids = inputs_dict.pop(self.input_name) - _ = inputs_dict.pop("attention_mask", None) - _ = inputs_dict.pop("decoder_input_ids", None) - _ = inputs_dict.pop("decoder_attention_mask", None) - attention_mask = ops.ones_like(input_ids, dtype=mindspore.int64) - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - attention_mask = attention_mask[:batch_size, :sequence_length] - - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - config.pad_token_id = config.eos_token_id - return config, input_ids, attention_mask, inputs_dict - - - def setUp(self): - self.model_tester = BigBirdPegasusModelTester(self) - self.config_tester = ConfigTester(self, config_class=BigBirdPegasusConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - 
model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - def test_model_various_attn_type(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["original_full", "block_sparse"]: - config_and_inputs[0].attention_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_generate_without_input_ids(self): - if self.model_tester.attention_type == "block_sparse": - self.skipTest( - "Cannot pass for BigBird-block-sparse attention since input_ids must be multiple of block_size" - ) - super().test_generate_without_input_ids() - - def test_retain_grad_hidden_states_attentions(self): - if self.model_tester.attention_type == "block_sparse": - # this test can't pass since attention matrix (which is getting returned) can't have gradients (& just 0 at many locations) - self.skipTest(reason="Cannot pass since returned attention matrix can't have gradients") - super().test_retain_grad_hidden_states_attentions() - - # BigBirdPegasusForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in ( - BigBirdPegasusModel, - BigBirdPegasusForConditionalGeneration, - BigBirdPegasusForQuestionAnswering, - ): - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_dict.pop("decoder_attention_mask") - input_dict.pop("decoder_input_ids") - model = BigBirdPegasusForConditionalGeneration(config).eval() - model.half() - model.generate(**input_dict) - model.generate(**input_dict, do_sample=True, early_stopping=False, num_return_sequences=3) - - @slow - def test_batched_forward_original_full(self): - self._check_batched_forward(attn_type="original_full") - - @slow - def test_batched_forward_block_sparse(self): - self._check_batched_forward(attn_type="block_sparse", tolerance=1e-1) - - def _check_batched_forward(self, attn_type, tolerance=1e-3): - config, _ = self.model_tester.prepare_config_and_inputs() - config.max_position_embeddings = 128 - config.block_size = 16 - config.attention_type = attn_type - model = BigBirdPegasusForConditionalGeneration(config) - model.eval() - - chunk_length = 32 - - sample_with_padding = [3, 8, 11] * chunk_length + [0] * chunk_length - 
sample_without_padding = [4, 7, 9, 13] * chunk_length - target_ids_without_padding = [2, 3] * 8 - target_ids_with_padding = [7, 8] * 6 + 4 * [-100] - - attention_mask = mindspore.tensor( - [[1] * 3 * chunk_length + [0] * chunk_length, [1] * 4 * chunk_length], - dtype=mindspore.int64, - ) - - input_ids = mindspore.tensor([sample_with_padding, sample_without_padding], dtype=mindspore.int64) - labels = mindspore.tensor( - [target_ids_without_padding, target_ids_with_padding], dtype=mindspore.int64 - ) - - with no_grad(): - logits_batched = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).logits - - with no_grad(): - logits_single_first = model(input_ids=input_ids[:1, :-chunk_length], labels=labels[:1]).logits - - self.assertTrue(ops.allclose(logits_batched[0, -3:], logits_single_first[0, -3:], atol=tolerance)) - - with no_grad(): - logits_single_second = model(input_ids=input_ids[1:], labels=labels[1:, :-4]).logits - - self.assertTrue(ops.allclose(logits_batched[1, :3], logits_single_second[0, :3], atol=tolerance)) - - def test_auto_padding(self): - ids = [[7, 6, 9] * 65] - config, _ = self.model_tester.prepare_config_and_inputs() - input_ids = mindspore.tensor(ids, dtype=mindspore.int64) - attention_mask = input_ids.new_ones(input_ids.shape) - decoder_input_ids = mindspore.tensor([[33, 5, 8] * 3], dtype=mindspore.int64) - - config.block_size = 8 - model = BigBirdPegasusForConditionalGeneration(config).eval() - output1 = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[ - "logits" - ] - - ids = [[7, 6, 9] * 65 + [0] * 5] - input_ids = mindspore.tensor(ids, dtype=mindspore.int64) - attention_mask = mindspore.tensor([[1] * 3 * 65 + [0] * 5], dtype=mindspore.int64) - output2 = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[ - "logits" - ] - - self.assertTrue(ops.allclose(output1, output2, atol=1e-5)) - - def test_for_change_to_full_attn(self): - self.model_tester.seq_length = 9 - config, input_dict = self.model_tester.prepare_config_and_inputs() - - # automatic switch will happen - config.attention_type = "block_sparse" - model = BigBirdPegasusForConditionalGeneration(config).eval() - state_dict = model.state_dict() - outputs1 = model(**input_dict)["logits"] - - config.attention_type = "original_full" - model = BigBirdPegasusForConditionalGeneration(config).eval() - model.load_state_dict(state_dict) - outputs2 = model(**input_dict)["logits"] - - self.assertTrue(ops.allclose(outputs1, outputs2, atol=1e-5)) - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@slow -class BigBirdPegasusModelIntegrationTests(unittest.TestCase): - def _get_dummy_input_ids(self): - # fmt: off - ids = mindspore.tensor( - [[685, 560, 630, 193, 836, 764, 708, 360, 10, 724, 278, 755, 805, 600, 71, 473, 601, 397, 315, 706, 487, 552, 88, 175, 601, 850, 678, 538, 846, 73, 778, 917, 116, 977, 756, 710, 1023, 848, 432, 449, 851, 100, 985, 178, 756, 798, 660, 148, 911, 424, 289, 962, 266, 698, 640, 545, 544, 715, 245, 152, 676, 511, 460, 883, 184, 29, 803, 129, 129, 933, 54, 902, 551, 489, 757, 274, 336, 389, 618, 43, 443, 544, 889, 258, 322, 1000, 938, 58, 292, 871, 120, 780, 431, 83, 92, 897, 399, 612, 566, 909, 634, 939, 85, 204, 325, 775, 
965, 48, 640, 1013, 132, 973, 869, 181, 1001, 847, 144, 661, 228, 955, 792, 720, 910, 374, 854, 561, 306, 582, 170, 676, 449, 96, 198, 607, 257, 882, 691, 293, 931, 817, 862, 388, 611, 555, 974, 369, 1000, 918, 202, 384, 513, 907, 371, 556, 955, 384, 24, 700, 131, 378, 99, 575, 932, 735, 124, 964, 595, 943, 740, 149, 210, 563, 412, 783, 42, 59, 706, 37, 779, 87, 44, 873, 12, 771, 308, 81, 33, 183, 129, 807, 276, 175, 555, 372, 185, 445, 489, 590, 287, 281, 638, 771, 516, 95, 227, 876, 270, 881, 297, 329, 20, 608, 841, 411, 451, 249, 181, 324, 1005, 830, 783, 865, 261, 964, 750, 140, 1021, 599, 462, 890, 622, 844, 697, 529, 153, 926, 150, 111, 26, 465, 957, 890, 887, 118, 446, 596, 674, 873, 929, 229, 508, 764, 122, 327, 470, 288, 526, 840, 697, 153, 592, 42, 275, 553, 439, 208, 780, 167, 112, 350, 1018, 130, 736, 887, 813, 217, 382, 25, 68, 979, 1008, 772, 235, 717, 999, 292, 727, 1023, 702, 710, 728, 556, 33, 12, 617, 213, 139, 695, 1004, 422, 638, 669, 624, 489, 771, 540, 980, 218, 664, 822, 308, 175, 149, 950, 542, 580, 548, 808, 394, 74, 298, 920, 900, 815, 731, 947, 877, 772, 800, 778, 395, 540, 430, 200, 424, 62, 342, 866, 45, 803, 931, 89, 34, 646, 233, 768, 37, 769, 460, 291, 198, 895, 950, 255, 81, 447, 137, 190, 130, 210, 369, 292, 377, 348, 169, 885, 805, 177, 538, 324, 872, 509, 804, 115, 799, 30, 754, 290, 147, 274, 222, 341, 510, 515, 70, 358, 909, 557, 886, 766, 323, 624, 92, 342, 424, 552, 972, 663, 415, 658, 711, 968, 275, 861, 44, 84, 434, 810, 94, 175, 406, 202, 858, 499, 481, 988, 330, 541, 1004, 210, 618, 955, 897, 983, 576, 17, 107, 165, 607, 537, 629, 192, 196, 308, 137, 953, 860, 94, 892, 751, 88, 161, 148, 585, 456, 88, 14, 315, 594, 121, 885, 952, 833, 716, 733, 933, 282, 801, 427, 783, 471, 285, 277, 979, 325, 535, 228, 891, 596, 648, 969, 574, 654, 518, 257, 137, 208, 464, 950, 140, 5, 424, 349, 942, 283, 587, 821, 1007, 434, 220, 820, 740, 874, 787, 374, 291, 564, 671, 438, 827, 940, 824, 509, 1021, 787, 942, 856, 450, 327, 491, 54, 817, 95, 60, 337, 667, 637, 164, 571, 946, 107, 202, 301, 782, 890, 839, 551, 680, 649, 14, 1017, 904, 721, 1017, 535, 505, 848, 986, 777, 740, 775, 210, 456, 469, 474, 963, 573, 401, 57, 883, 750, 664, 281, 5, 613, 1005, 306, 344, 543, 567, 154, 789, 354, 358, 698, 408, 412, 30, 930, 372, 822, 632, 948, 855, 503, 8, 618, 1010, 138, 695, 897, 852, 377, 933, 722, 149, 886, 1009, 260, 127, 811, 578, 533, 805, 325, 977, 113, 944, 651, 238, 361, 991, 860, 556, 64, 928, 917, 455, 266, 445, 604, 624, 420, 340, 845, 275, 370, 843, 227, 226, 940, 644, 909, 229, 827, 898, 370, 129, 808, 25, 699, 293, 356, 838, 135, 4, 227, 890, 681, 445, 418, 285, 837, 27, 737, 249, 366, 948, 202, 438, 198, 930, 648, 638, 607, 73, 247, 853, 136, 708, 214, 476, 621, 324, 103, 853, 328, 596, 224, 257, 646, 348, 108, 927, 970, 980, 520, 150, 998, 477, 393, 684, 559, 1, 361, 692, 551, 90, 75, 500, 739, 636, 344, 97, 852, 283, 719, 33, 116, 455, 866, 429, 828, 826, 691, 174, 746, 133, 442, 94, 348, 402, 420, 707, 405, 942, 186, 976, 376, 677, 874, 703, 517, 498, 499, 206, 415, 366, 856, 739, 420, 586, 219, 952, 539, 375, 23, 461, 720, 355, 603, 52, 999, 815, 721, 574, 445, 816, 1019, 105, 641, 395, 972, 910, 328, 607, 519, 686, 246, 415, 528, 170, 167, 310, 940, 595, 392, 221, 834, 682, 835, 115, 861, 335, 742, 220, 247, 101, 416, 222, 179, 509, 175, 606, 627, 674, 781, 737, 746, 849, 67, 457, 1012, 126, 139, 625, 731, 156, 697, 121, 322, 449, 710, 857, 291, 976, 4, 701, 239, 678, 172, 724, 857, 583, 661, 903, 797, 628, 903, 835, 605, 989, 615, 870, 380, 710, 
110, 330, 101, 695, 846, 918, 508, 672, 594, 36, 238, 244, 251, 393, 767, 282, 22, 430, 230, 983, 401, 154, 1007, 120, 678, 896, 386, 390, 711, 397, 347, 587, 1020, 951, 79, 831, 585, 200, 814, 134, 560, 700, 171, 452, 139, 755, 314, 476, 346, 388, 126, 719, 851, 198, 699, 901, 18, 710, 448, 351, 665, 644, 326, 425, 165, 571, 178, 440, 665, 674, 915, 866, 463, 754, 136, 950, 748, 47, 497, 1013, 640, 930, 338, 158, 525, 631, 815, 887, 289, 803, 116, 600, 637, 410, 175, 499, 876, 565, 1002, 623, 577, 333, 887, 586, 147, 773, 776, 644, 49, 77, 294, 117, 494, 561, 110, 979, 180, 562, 72, 859, 434, 1007, 286, 516, 75, 597, 491, 322, 888, 533, 209, 43, 499, 29, 411, 856, 181, 305, 963, 615, 778, 259, 373, 877, 746, 858, 381, 886, 613, 91, 69, 618, 523, 13, 617, 226, 422, 168, 929, 379, 290, 923, 100, 218, 307, 345, 211, 789, 735, 669, 585, 275, 410, 921, 552, 235, 636, 285, 665, 659, 708, 173, 724, 302, 823, 1, 139, 708, 903, 732, 868, 442, 967, 916, 163, 51, 243, 871]], # noqa: E231 - dtype=mindspore.int64, - ) - # fmt: on - return ids - - def _get_dummy_target_ids(self): - # fmt: off - ids = mindspore.tensor( - [[13, 6, 1, 4, 12, 4, 8, 10, 4, 6, 3, 5, 8, 7, 9, 9]], # noqa: E231 - dtype=mindspore.int64, - ) - # fmt: on - return ids - - def test_inference_block_sparse(self): - model = BigBirdPegasusForConditionalGeneration.from_pretrained( - MODEL_ID, attention_type="block_sparse", block_size=16, num_random_blocks=3 - ) - - input_ids = self._get_dummy_input_ids() - target_ids = self._get_dummy_target_ids() - - outputs = model(input_ids, labels=target_ids) - prediction_logits = outputs.logits - - self.assertEqual(prediction_logits.shape, (1, 16, 96103)) - # fmt: off - expected_prediction_logits_slice = mindspore.tensor( - [[1.5118, 5.5227, 4.8125, 1.7603, 8.1704, 3.996, 4.8118, 6.7806, 2.2297, 6.9834, 3.1906, 0.103, 7.1515, 6.3679, 3.1896, 6.3054, 3.9741, 6.3772, 5.0042, -0.6338, 6.7868, 0.592, 0.5363, 1.87, -0.331, -2.4518, 1.8263, 3.1899], [1.5702, 5.8135, 4.6675, 2.3674, 8.9828, 3.7913, 5.4027, 7.6567, 1.9007, 7.3706, 3.8824, 0.0247, 7.6094, 6.6985, 3.2826, 7.0094, 3.8713, 5.6555, 5.0439, -0.3519, 7.1525, 0.4062, -0.2419, 2.2194, -0.6447, -2.9614, 2.0713, 3.248], [1.4527, 5.6003, 4.5381, 2.6382, 9.2809, 3.2969, 5.6811, 8.4011, 1.6909, 7.4937, 4.3185, -0.0878, 7.61, 6.6822, 3.4753, 7.3962, 3.5336, 4.9216, 4.943, -0.2043, 7.3326, 0.2199, -0.6016, 2.4367, -0.7043, -3.0689, 2.3215, 3.0611], [1.1084, 5.6308, 4.4886, 2.717, 9.4103, 3.0733, 5.5825, 8.4325, 1.3075, 7.5495, 4.4782, -0.1092, 7.8115, 6.6285, 3.5311, 7.6853, 3.509, 4.4994, 4.9224, -0.1384, 7.3069, -0.0473, -0.8578, 2.4632, -0.5249, -3.4627, 2.2671, 2.8818]], # noqa: E231 - ) - - # fmt: on - self.assertTrue( - ops.allclose(prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, atol=1e-4) - ) - - def test_inference_full_attn(self): - model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID, attention_type="original_full") - - input_ids = self._get_dummy_input_ids() - target_ids = self._get_dummy_target_ids() - - outputs = model(input_ids, labels=target_ids) - prediction_logits = outputs.logits - - self.assertEqual(prediction_logits.shape, (1, 16, 96103)) - # fmt: off - expected_prediction_logits_slice = mindspore.tensor( - [[1.3418, 5.8304, 6.5662, 2.0448, 8.7702, 4.6579, 4.9947, 6.429, 2.4296, 7.9431, 4.217, 0.0672, 7.334, 5.1966, 2.9603, 6.0814, 4.6756, 7.5522, 5.076, 0.213, 6.6638, 0.6577, 0.244, 2.1221, 0.7531, -2.4076, 1.8731, 3.5594], [1.5525, 6.0524, 6.309, 2.6245, 9.229, 4.5213, 5.0913, 7.0622, 1.7992, 
8.0962, 4.7994, -0.0248, 7.7168, 5.5878, 3.0883, 6.5248, 4.7895, 6.9974, 4.8787, 0.5445, 6.6686, 0.0102, -0.1659, 2.6195, 0.7389, -2.8956, 1.9928, 3.3777], [1.6407, 6.2104, 6.0331, 2.8076, 9.4074, 3.9772, 5.0574, 7.5316, 1.4201, 8.3035, 5.0212, -0.1031, 7.553, 5.5023, 3.1427, 6.7674, 4.4409, 6.457, 4.525, 0.728, 6.5422, -0.6234, -0.4726, 2.7486, 0.6985, -3.0804, 1.9669, 3.2365], [1.5065, 6.1271, 5.8296, 2.8405, 9.5649, 3.6834, 5.1214, 7.546, 0.9758, 8.3335, 5.1952, -0.1395, 7.4348, 5.6893, 3.2942, 7.0356, 4.1665, 5.9695, 4.3898, 0.8931, 6.3988, -0.8957, -0.7522, 2.8924, 0.6498, -3.4358, 1.8654, 2.9735]], # noqa: E231 - ) - # fmt: on - self.assertTrue( - ops.allclose(prediction_logits[0, 4:8, 128:156], expected_prediction_logits_slice, atol=1e-4) - ) - - def test_seq_to_seq_generation(self): - MODEL_ID = "google/bigbird-pegasus-large-arxiv" - model = BigBirdPegasusForConditionalGeneration.from_pretrained(MODEL_ID) - tokenizer = PegasusTokenizer.from_pretrained(MODEL_ID) - - ARTICLE_LEP = r"""the lep experiments at the resonance of @xmath1-boson have tested the standard model ( sm ) at quantum level , measuring the @xmath1-decay into fermion pairs with an accuracy of one part in ten thousands . the good agreement of the lep data with the sm predictions have severely constrained the behavior of new physics at the @xmath1-pole . taking these achievements into account one can imagine that the physics of @xmath1-boson will again play the central role in the frontier of particle physics if the next generation @xmath1 factory comes true with the generated @xmath1 events several orders of magnitude higher than that of the lep . this factory can be realized in the gigaz option of the international linear collider ( ilc)@xcite . the ilc is a proposed electron - positron collider with tunable energy ranging from @xmath12 to @xmath13 and polarized beams in its first phase , and the gigaz option corresponds to its operation on top of the resonance of @xmath1 boson by adding a bypass to its main beam line . given the high luminosity , @xmath14 , and the cross section at the resonance of @xmath1 boson , @xmath15 , about @xmath16 @xmath1 events can be generated in an operational year of @xmath17 of gigaz , which implies that the expected sensitivity to the branching ratio of @xmath1-decay can be improved from @xmath18 at the lep to @xmath19 at the gigaz@xcite . in light of this , the @xmath1-boson properties , especially its exotic or rare decays which are widely believed to be sensitive to new physics , should be investigated comprehensively to evaluate their potential in probing new physics . among the rare @xmath1-decays , the flavor changing ( fc ) processes were most extensively studied to explore the flavor texture in new physics @xcite , and it was found that , although these processes are severely suppressed in the sm , their branching ratios in new physics models can be greatly enhanced to @xmath19 for lepton flavor violation decays @xcite and @xmath20 for quark flavor violation decays @xcite . besides the fc processes , the @xmath1-decay into light higgs boson(s ) is another type of rare process that was widely studied , e.g. the decay @xmath21 ( @xmath22 ) with the particle @xmath0 denoting a light higgs boson was studied in @xcite , the decay @xmath23 was studied in the two higgs doublet model ( 2hdm)@xcite and the minimal supersymmetric standard model ( mssm)@xcite , and the decay @xmath4 was studied in a model independent way @xcite , in 2hdm@xcite and also in mssm@xcite . 
these studies indicate that , in contrast with the kinematic forbidden of these decays in the sm , the rates of these decays can be as large as @xmath18 in new physics models , which lie within the expected sensitivity of the gigaz . in this work , we extend the previous studies of these decays to some new models and investigate these decays altogether . we are motivated by some recent studies on the singlet extension of the mssm , such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly minimal supersymmetric standard model ( nmssm ) @xcite , where a light cp - odd higgs boson @xmath0 with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry like @xmath24 or peccei - quuin symmetry @xcite . these non - minimal supersymmetric models can not only avoid the @xmath25-problem , but also alleviate the little hierarchy by having such a light higgs boson @xmath0 @xcite . we are also motivated by that , with the latest experiments , the properties of the light higgs boson are more stringently constrained than before . so it is worth updating the previous studies . so far there is no model - independent lower bound on the lightest higgs boson mass . in the sm , it must be heavier than @xmath26 gev , obtained from the null observation of the higgs boson at lep experiments . however , due to the more complex structure of the higgs sector in the extensions of the sm , this lower bound can be significantly relaxed according to recent studies , e.g. , for the cp - odd higgs boson @xmath0 we have @xmath27 gev in the nmssm @xcite , @xmath28 gev in the nmssm @xcite , and @xmath29 gev in the lepton - specific 2hdm ( l2hdm ) @xcite . with such a light cp - odd higgs boson , the z - decay into one or more @xmath0 is open up . noting that the decay @xmath30 is forbidden due to bose symmetry , we in this work study the rare @xmath1-decays @xmath6 ( @xmath22 ) , @xmath31 and @xmath4 in a comparative way for four models , namely the type - ii 2hdm@xcite , the l2hdm @xcite , the nmssm and the nmssm . in our study , we examine carefully the constraints on the light @xmath0 from many latest experimental results . this work is organized as follows . in sec . ii we briefly describe the four new physics models . in sec . iii we present the calculations of the rare @xmath1-decays . in sec . iv we list the constraints on the four new physics models . in sec . v we show the numerical results for the branching ratios of the rare @xmath1-decays in various models . finally , the conclusion is given in sec . as the most economical way , the sm utilizes one higgs doublet to break the electroweak symmetry . as a result , the sm predicts only one physical higgs boson with its properties totally determined by two free parameters . in new physics models , the higgs sector is usually extended by adding higgs doublets and/or singlets , and consequently , more physical higgs bosons are predicted along with more free parameters involved in . the general 2hdm contains two @xmath32 doublet higgs fields @xmath33 and @xmath34 , and with the assumption of cp - conserving , its scalar potential can be parameterized as@xcite : @xmath35,\end{aligned}\ ] ] where @xmath36 ( @xmath37 ) are free dimensionless parameters , and @xmath38 ( @xmath39 ) are the parameters with mass dimension . 
after the electroweak symmetry breaking , the spectrum of this higgs sector includes three massless goldstone modes , which become the longitudinal modes of @xmath40 and @xmath1 bosons , and five massive physical states : two cp - even higgs bosons @xmath41 and @xmath42 , one neutral cp - odd higgs particle @xmath0 and a pair of charged higgs bosons @xmath43 . noting the constraint @xmath44 with @xmath45 and @xmath46 denoting the vacuum expectation values ( vev ) of @xmath33 and @xmath34 respectively , we choose @xmath47 as the input parameters with @xmath48 , and @xmath49 being the mixing angle that diagonalizes the mass matrix of the cp - even higgs fields . the difference between the type - ii 2hdm and the l2hdm comes from the yukawa coupling of the higgs bosons to quark / lepton . in the type - ii 2hdm , one higgs doublet @xmath34 generates the masses of up - type quarks and the other doublet @xmath33 generates the masses of down - type quarks and charged leptons ; while in the l2hdm one higgs doublet @xmath33 couples only to leptons and the other doublet @xmath34 couples only to quarks . so the yukawa interactions of @xmath0 to fermions in these two models are given by @xcite @xmath50 with @xmath51 denoting generation index . obviously , in the type - ii 2hdm the @xmath52 coupling and the @xmath53 coupling can be simultaneously enhanced by @xmath54 , while in the l2hdm only the @xmath53 coupling is enhanced by @xmath55 . the structures of the nmssm and the nmssm are described by their superpotentials and corresponding soft - breaking terms , which are given by @xcite @xmath56 where @xmath57 is the superpotential of the mssm without the @xmath25 term , @xmath58 and @xmath59 are higgs doublet and singlet superfields with @xmath60 and @xmath61 being their scalar component respectively , @xmath62 , @xmath63 , @xmath64 , @xmath65 , @xmath66 and @xmath67 are soft breaking parameters , and @xmath68 and @xmath69 are coefficients of the higgs self interactions . with the superpotentials and the soft - breaking terms , one can get the higgs potentials of the nmssm and the nmssm respectively . like the 2hdm , the higgs bosons with same cp property will mix and the mass eigenstates are obtained by diagonalizing the corresponding mass matrices : @xmath70 where the fields on the right hands of the equations are component fields of @xmath71 , @xmath72 and @xmath61 defined by @xmath73 @xmath74 and @xmath75 are respectively the cp - even and cp - odd neutral higgs bosons , @xmath76 and @xmath77 are goldstone bosons eaten by @xmath1 and @xmath78 , and @xmath79 is the charged higgs boson . so both the nmssm and nmssm predict three cp - even higgs bosons , two cp - odd higgs bosons and one pair of charged higgs bosons . in general , the lighter cp - odd higgs @xmath0 in these model is the mixture of the singlet field @xmath80 and the doublet field combination , @xmath81 , i.e. @xmath82 and its couplings to down - type quarks are then proportional to @xmath83 . so for singlet dominated @xmath0 , @xmath84 is small and the couplings are suppressed . as a comparison , the interactions of @xmath0 with the squarks are given by@xcite @xmath85 i.e. the interaction does not vanish when @xmath86 approaches zero . 
just like the 2hdm where we use the vevs of the higgs fields as fundamental parameters , we choose @xmath68 , @xmath69 , @xmath87 , @xmath88 , @xmath66 and @xmath89 as input parameters for the nmssm@xcite and @xmath68 , @xmath54 , @xmath88 , @xmath65 , @xmath90 and @xmath91 as input parameters for the nmssm@xcite . about the nmssm and the nmssm , three points should be noted . the first is for the two models , there is no explicit @xmath92term , and the effective @xmath25 parameter ( @xmath93 ) is generated when the scalar component of @xmath59 develops a vev . the second is , the nmssm is actually same as the nmssm with @xmath94@xcite , because the tadpole terms @xmath95 and its soft breaking term @xmath96 in the nmssm do not induce any interactions , except for the tree - level higgs boson masses and the minimization conditions . and the last is despite of the similarities , the nmssm has its own peculiarity , which comes from its neutralino sector . in the basis @xmath97 , its neutralino mass matrix is given by @xcite @xmath98 where @xmath99 and @xmath100 are @xmath101 and @xmath102 gaugino masses respectively , @xmath103 , @xmath104 , @xmath105 and @xmath106 . after diagonalizing this matrix one can get the mass eigenstate of the lightest neutralino @xmath107 with mass taking the following form @xcite @xmath108 this expression implies that @xmath107 must be lighter than about @xmath109 gev for @xmath110 ( from lower bound on chargnio mass ) and @xmath111 ( perturbativity bound ) . like the other supersymmetric models , @xmath107 as the lightest sparticle acts as the dark matter in the universe , but due to its singlino - dominated nature , it is difficult to annihilate sufficiently to get the correct density in the current universe . so the relic density of @xmath107 plays a crucial way in selecting the model parameters . for example , as shown in @xcite , for @xmath112 , there is no way to get the correct relic density , and for the other cases , @xmath107 mainly annihilates by exchanging @xmath1 boson for @xmath113 , or by exchanging a light cp - odd higgs boson @xmath0 with mass satisfying the relation @xmath114 for @xmath115 . for the annihilation , @xmath54 and @xmath25 are required to be less than 10 and @xmath116 respectively because through eq.([mass - exp ] ) a large @xmath87 or @xmath25 will suppress @xmath117 to make the annihilation more difficult . the properties of the lightest cp - odd higgs boson @xmath0 , such as its mass and couplings , are also limited tightly since @xmath0 plays an important role in @xmath107 annihilation . the phenomenology of the nmssm is also rather special , and this was discussed in detail in @xcite . in the type - ii 2hdm , l2hdm , nmssm and nmssm , the rare @xmath1-decays @xmath118 ( @xmath22 ) , @xmath3 and @xmath4 may proceed by the feynman diagrams shown in fig.[fig1 ] , fig.[fig2 ] and fig.[fig3 ] respectively . for these diagrams , the intermediate state @xmath119 represents all possible cp - even higgs bosons in the corresponding model , i.e. @xmath41 and @xmath42 in type - ii 2hdm and l2hdm and @xmath41 , @xmath42 and @xmath120 in nmssm and nmssm . in order to take into account the possible resonance effects of @xmath119 in fig.[fig1](c ) for @xmath2 and fig.[fig3 ] ( a ) for @xmath11 , we have calculated all the decay modes of @xmath119 and properly included the width effect in its propagator . as to the decay @xmath121 , two points should be noted . 
one is , unlike the decays @xmath6 and @xmath11 , this process proceeds only through loops mediated by quarks / leptons in the type - ii 2hdm and l2hdm , and additionally by sparticles in the nmssm and nmssm . so in most cases its rate should be much smaller than the other two . the other is due to cp - invariance , loops mediated by squarks / sleptons give no contribution to the decay@xcite . in actual calculation , this is reflected by the fact that the coupling coefficient of @xmath122 differs from that of @xmath123 by a minus sign ( see eq.([asqsq ] ) ) , and as a result , the squark - mediated contributions to @xmath121 are completely canceled out . with regard to the rare decay @xmath11 , we have more explanations . in the lowest order , this decay proceeds by the diagram shown in fig.[fig3 ] ( a ) , and hence one may think that , as a rough estimate , it is enough to only consider the contributions from fig.[fig3](a ) . however , we note that in some cases of the type - ii 2hdm and l2hdm , due to the cancelation of the contributions from different @xmath119 in fig.[fig3 ] ( a ) and also due to the potentially largeness of @xmath124 couplings ( i.e. larger than the electroweak scale @xmath125 ) , the radiative correction from the higgs - mediated loops may dominate over the tree level contribution even when the tree level prediction of the rate , @xmath126 , exceeds @xmath20 . on the other hand , we find the contribution from quark / lepton - mediated loops can be safely neglected if @xmath127 in the type - ii 2hdm and the l2hdm . in the nmssm and the nmssm , besides the corrections from the higgs- and quark / lepton - mediated loops , loops involving sparticles such as squarks , charginos and neutralinos can also contribute to the decay . we numerically checked that the contributions from squarks and charginos can be safely neglected if @xmath127 . we also calculated part of potentially large neutralino correction ( note that there are totally about @xmath128 diagrams for such correction ! ) and found they can be neglected too . since considering all the radiative corrections will make our numerical calculation rather slow , we only include the most important correction , namely that from higgs - mediated loops , in presenting our results for the four models . one can intuitively understand the relative smallness of the sparticle contribution to @xmath11 as follows . first consider the squark contribution which is induced by the @xmath129 interaction ( @xmath130 denotes the squark in chirality state ) and the @xmath131 interaction through box diagrams . because the @xmath132 interaction conserves the chirality of the squarks while the @xmath133 interaction violates the chirality , to get non - zero contribution to @xmath11 from the squark loops , at least four chiral flippings are needed , with three of them provided by @xmath131 interaction and the rest provided by the left - right squark mixing . this means that , if one calculates the amplitude in the chirality basis with the mass insertion method , the amplitude is suppressed by the mixing factor @xmath134 with @xmath135 being the off diagonal element in squark mass matrix . next consider the chargino / neutralino contributions . since for a light @xmath0 , its doublet component , parameterized by @xmath84 in eq.([mixing ] ) , is usually small , the couplings of @xmath0 with the sparticles will never be tremendously large@xcite . so the chargino / neutralino contributions are not important too . 
in our calculation of the decays , we work in the mass eigenstates of sparticles instead of in the chirality basis . for the type - ii 2hdm and the l2hdm , we consider the following constraints @xcite : * theoretical constraints on @xmath136 from perturbativity , unitarity and requirements that the scalar potential is finit at large field values and contains no flat directions @xcite , which imply that @xmath137 * the constraints from the lep search for neutral higgs bosons . we compute the signals from the higgs - strahlung production @xmath138 ( @xmath139 ) with @xmath140 @xcite and from the associated production @xmath141 with @xmath142 @xcite , and compare them with the corresponding lep data which have been inputted into our code . we also consider the constraints from @xmath138 by looking for a peak of @xmath143 recoil mass distribution of @xmath1-boson @xcite and the constraint of @xmath144 mev when @xmath145 @xcite . + these constraints limit the quantities such as @xmath146 \times br ( h_i \to \bar{b } b ) $ ] on the @xmath147 plane with the the subscript @xmath148 denoting the coupling coefficient of the @xmath149 interaction . they also impose a model - dependent lower bound on @xmath150 , e.g. , @xmath151 for the type - ii 2hdm ( from our scan results ) , @xmath152 for the l2hdm@xcite , and @xmath153 for the nmssm @xcite . these bounds are significantly lower than that of the sm , i.e. @xmath154 , partially because in new physics models , unconventional decay modes of @xmath155 such as @xmath156 are open up . as to the nmssm , another specific reason for allowing a significantly lighter cp - even higgs boson is that the boson may be singlet - dominated in this model . + with regard to the lightest cp - odd higgs boson @xmath0 , we checked that there is no lower bound on its mass so long as the @xmath157 interaction is weak or @xmath155 is sufficiently heavy . * the constraints from the lep search for a light higgs boson via the yukawa process @xmath158 with @xmath22 and @xmath61 denoting a scalar @xcite . these constraints can limit the @xmath159 coupling versus @xmath160 in new physics models . * the constraints from the cleo - iii limit on @xmath161 and the latest babar limits on @xmath162 . these constraints will put very tight constraints on the @xmath163 coupling for @xmath164 . in our analysis , we use the results of fig.8 in the second paper of @xcite to excluded the unfavored points . * the constraints from @xmath165 couplings . since the higgs sector can give sizable higher order corrections to @xmath165 couplings , we calculate them to one loop level and require the corrected @xmath165 couplings to lie within the @xmath166 range of their fitted value . the sm predictions for the couplings at @xmath1-pole are given by @xmath167 and @xmath168 @xcite , and the fitted values are given by @xmath169 and @xmath170 , respectively@xcite . we adopt the formula in @xcite to the 2hdm in our calculation . * the constraints from @xmath171 leptonic decay . we require the new physics correction to the branching ratio @xmath172 to be in the range of @xmath173 @xcite . we use the formula in @xcite in our calculation . + about the constraints ( 5 ) and ( 6 ) , two points should be noted . one is all higgs bosons are involved in the constraints by entering the self energy of @xmath171 lepton , the @xmath174 vertex correction or the @xmath175 vertex correction , and also the box diagrams for @xmath176@xcite . 
since the yukawa couplings of the higgs bosons to @xmath171 lepton get enhanced by @xmath54 and so do the corrections , @xmath54 must be upper bounded for given spectrum of the higgs sector . generally speaking , the lighter @xmath0 is , the more tightly @xmath54 is limited@xcite . the other point is in the type - ii 2hdm , @xmath177 , b - physics observables as well as @xmath178 decays discussed above can constraint the model in a tighter way than the constraints ( 5 ) and ( 6 ) since the yukawa couplings of @xmath171 lepton and @xmath179 quark are simultaneously enhanced by @xmath54 . but for the l2hdm , because only the yukawa couplings of @xmath171 lepton get enhanced ( see eq.[yukawa ] ) , the constraints ( 5 ) and ( 6 ) are more important in limiting @xmath54 . * indirect constraints from the precision electroweak observables such as @xmath180 , @xmath181 and @xmath182 , or their combinations @xmath183 @xcite . we require @xmath184 to be compatible with the lep / sld data at @xmath185 confidence level@xcite . we also require new physics prediction of @xmath186 is within the @xmath187 range of its experimental value . the latest results for @xmath188 are @xmath189 ( measured value ) and @xmath190 ( sm prediction ) for @xmath191 gev @xcite . in our code , we adopt the formula for these observables presented in @xcite to the type - ii 2hdm and the l2hdm respectively . + in calculating @xmath180 , @xmath181 and @xmath182 , we note that these observables get dominant contributions from the self energies of the gauge bosons @xmath1 , @xmath192 and @xmath193 . since there is no @xmath194 coupling or @xmath195 coupling , @xmath0 must be associated with the other higgs bosons to contribute to the self energies . so by the uv convergence of these quantities , one can infer that , for the case of a light @xmath0 and @xmath196 , these quantities depend on the spectrum of the higgs sector in a way like @xmath197 at leading order , which implies that a light @xmath0 can still survive the constraints from the precision electroweak observables given the splitting between @xmath150 and @xmath198 is moderate@xcite . * the constraints from b physics observables such as the branching ratios for @xmath199 , @xmath200 and @xmath201 , and the mass differences @xmath202 and @xmath203 . we require their theoretical predications to agree with the corresponding experimental values at @xmath187 level . + in the type - ii 2hdm and the l2hdm , only the charged higgs boson contributes to these observables by loops , so one can expect that @xmath198 versus @xmath54 is to be limited . combined analysis of the limits in the type - ii 2hdm has been done by the ckmfitter group , and the lower bound of @xmath204 as a function of @xmath87 was given in fig.11 of @xcite . this analysis indicates that @xmath198 must be heavier than @xmath205 at @xmath185 c.l . regardless the value of @xmath54 . in this work , we use the results of fig.11 in @xcite to exclude the unfavored points . as for the l2hdm , b physics actually can not put any constraints@xcite because in this model the couplings of the charged higgs boson to quarks are proportional to @xmath206 and in the case of large @xmath54 which we are interested in , they are suppressed . in our analysis of the l2hdm , we impose the lep bound on @xmath198 , i.e. @xmath207@xcite . * the constraints from the muon anomalous magnetic moment @xmath208 . 
now both the theoretical prediction and the experimental measured value of @xmath208 have reached a remarkable precision , but a significant deviation still exists : @xmath209 @xcite . in the 2hdm , @xmath208 gets additional contributions from the one - loop diagrams induced by the higgs bosons and also from the two - loop barr - zee diagrams mediated by @xmath0 and @xmath155@xcite . if the higgs bosons are much heavier than @xmath25 lepton mass , the contributions from the barr - zee diagrams are more important , and to efficiently alleviate the discrepancy of @xmath208 , one needs a light @xmath0 along with its enhanced couplings to @xmath25 lepton and also to heavy fermions such as bottom quark and @xmath171 lepton to push up the effects of the barr - zee diagram@xcite . the cp - even higgs bosons are usually preferred to be heavy since their contributions to @xmath208 are negative . + in the type - ii 2hdm , because @xmath54 is tightly constrained by the process @xmath210 at the lep@xcite and the @xmath178 decay@xcite , the barr - zee diagram contribution is insufficient to enhance @xmath208 to @xmath187 range around its measured value@xcite . so in our analysis , we require the type - ii 2hdm to explain @xmath208 at @xmath211 level . while for the l2hdm , @xmath54 is less constrained compared with the type - ii 2hdm , and the barr - zee diagram involving the @xmath171-loop is capable to push up greatly the theoretical prediction of @xmath208@xcite . therefore , we require the l2hdm to explain the discrepancy at @xmath187 level . + unlike the other constraints discussed above , the @xmath208 constraint will put a two - sided bound on @xmath54 since on the one hand , it needs a large @xmath54 to enhance the barr - zee contribution , but on the other hand , too large @xmath54 will result in an unacceptable large @xmath208 . * since this paper concentrates on a light @xmath0 , the decay @xmath212 is open up with a possible large decay width . we require the width of any higgs boson to be smaller than its mass to avoid a too fat higgs boson@xcite . we checked that for the scenario characterized by @xmath213 , the coefficient of @xmath214 interaction is usually larger than the electroweak scale @xmath125 , and consequently a large decay width is resulted . for the nmssm and nmssm , the above constraints become more complicated because in these models , not only more higgs bosons are involved in , but also sparticles enter the constraints . so it is not easy to understand some of the constraints intuitively . take the process @xmath199 as an example . in the supersymmetric models , besides the charged higgs contribution , chargino loops , gluino loops as well as neutralino loops also contribute to the process@xcite , and depending on the susy parameters , any of these contributions may become dominated over or be canceled by other contributions . as a result , although the charged higgs affects the process in the same way as that in the type - ii 2hdm , charged higgs as light as @xmath215 is still allowed even for @xmath216@xcite . since among the constraints , @xmath208 is rather peculiar in that it needs new physics to explain the discrepancy between @xmath217 and @xmath218 , we discuss more about its dependence on susy parameters . in the nmssm and the nmssm , @xmath208 receives contributions from higgs loops and neutralino / chargino loops . for the higgs contribution , it is quite similar to that of the type - ii 2hdm except that more higgs bosons are involved in@xcite . 
for the neutralino / chargino contribution , in the light bino limit ( i.e. @xmath219 ) , it can be approximated by@xcite @xmath220 for @xmath221 with @xmath222 being smuon mass . so combining the two contributions together , one can learn that a light @xmath0 along with large @xmath54 and/or light smuon with moderate @xmath87 are favored to dilute the discrepancy . because more parameters are involved in the constraints on the supersymmetric models , we consider following additional constraints to further limit their parameters : * direct bounds on sparticle masses from the lep1 , the lep2 and the tevatron experiments @xcite . * the lep1 bound on invisible z decay @xmath223 ; the lep2 bound on neutralino production @xmath224 and @xmath225@xcite . * dark matter constraints from the wmap relic density 0.0975 @xmath226 0.1213 @xcite . note that among the above constraints , the constraint ( 2 ) on higgs sector and the constraint ( c ) on neutralino sector are very important . this is because in the supersymmetric models , the sm - like higgs is upper bounded by about @xmath227 at tree level and by about @xmath228 at loop level , and that the relic density restricts the lsp annihilation cross section in a certain narrow range . in our analysis of the nmssm , we calculate the constraints ( 3 ) and ( 5 - 7 ) by ourselves and utilize the code nmssmtools @xcite to implement the rest constraints . we also extend nmssmtools to the nmssm to implement the constraints . for the extension , the most difficult thing we faced is how to adapt the code micromegas@xcite to the nmssm case . we solve this problem by noting the following facts : * as we mentioned before , the nmssm is actually same as the nmssm with the trilinear singlet term setting to zero . so we can utilize the model file of the nmssm as the input of the micromegas and set @xmath229 . * since in the nmssm , the lsp is too light to annihilate into higgs pairs , there is no need to reconstruct the effective higgs potential to calculate precisely the annihilation channel @xmath230 with @xmath61 denoting any of higgs bosons@xcite . we thank the authors of the nmssmtools for helpful discussion on this issue when we finish such extension@xcite . with the above constraints , we perform four independent random scans over the parameter space of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively . we vary the parameters in following ranges : @xmath231 for the type - ii 2hdm , @xmath232 for the l2hdm , @xmath233 for the nmssm , and @xmath234 for the nmssm . in performing the scans , we note that for the nmssm and the nmssm , some constraints also rely on the gaugino masses and the soft breaking parameters in the squark sector and the slepton sector . since these parameters affect little on the properties of @xmath0 , we fix them to reduce the number of free parameters in our scan . for the squark sector , we adopt the @xmath235 scenario which assumes that the soft mass parameters for the third generation squarks are degenerate : @xmath236 800 gev , and that the trilinear couplings of the third generation squarks are also degenerate , @xmath237 with @xmath238 . for the slepton sector , we assume all the soft - breaking masses and trilinear parameters to be 100 gev . this setting is necessary for the nmssm since this model is difficult to explain the muon anomalous moment at @xmath239 level for heavy sleptons@xcite . 
finally , we assume the grand unification relation @xmath240 for the gaugino masses with @xmath241 being fine structure constants of the different gauge group . with large number of random points in the scans , we finally get about @xmath242 , @xmath243 , @xmath244 and @xmath242 samples for the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively which survive the constraints and satisfy @xmath245 . analyzing the properties of the @xmath0 indicates that for most of the surviving points in the nmssm and the nmssm , its dominant component is the singlet field ( numerically speaking , @xmath246 ) so that its couplings to the sm fermions are suppressed@xcite . our analysis also indicates that the main decay products of @xmath0 are @xmath247 for the l2hdm@xcite , @xmath248 ( dominant ) and @xmath247 ( subdominant ) for the type - ii 2hdm , the nmssm and the nmssm , and in some rare cases , neutralino pairs in the nmssm@xcite . in fig.[fig4 ] , we project the surviving samples on the @xmath249 plane . this figure shows that the allowed range of @xmath54 is from @xmath250 to @xmath251 in the type - ii 2hdm , and from @xmath252 to @xmath253 in the l2hdm . just as we introduced before , the lower bounds of @xmath254 come from the fact that we require the models to explain the muon anomalous moment , while the upper bound is due to we have imposed the constraint from the lep process @xmath255 , which have limited the upper reach of the @xmath256 coupling for light @xmath61 @xcite(for the dependence of @xmath256 coupling on @xmath54 , see sec . this figure also indicates that for the nmssm and the nmssm , @xmath54 is upper bounded by @xmath257 . for the nmssm , this is because large @xmath87 can suppress the dark matter mass to make its annihilation difficult ( see @xcite and also sec . ii ) , but for the nmssm , this is because we choose a light slepton mass so that large @xmath54 can enhance @xmath208 too significantly to be experimentally unacceptable . we checked that for the slepton mass as heavy as @xmath258 , @xmath259 is still allowed for the nmssm . in fig.[fig5 ] and fig.[fig6 ] , we show the branching ratios of @xmath260 and @xmath261 respectively . fig.[fig5 ] indicates , among the four models , the type - ii 2hdm predicts the largest ratio for @xmath260 with its value varying from @xmath262 to @xmath263 . the underlying reason is in the type - ii 2hdm , the @xmath264 coupling is enhanced by @xmath54 ( see fig.[fig4 ] ) , while in the other three model , the coupling is suppressed either by @xmath265 or by the singlet component of the @xmath0 . fig.[fig6 ] shows that the l2hdm predicts the largest rate for @xmath266 with its value reaching @xmath5 in optimum case , and for the other three models , the ratio of @xmath261 is at least about one order smaller than that of @xmath267 . this feature can be easily understood from the @xmath268 coupling introduced in sect . we emphasize that , if the nature prefers a light @xmath0 , @xmath260 and/or @xmath269 in the type - ii 2hdm and the l2hdm will be observable at the gigaz . then by the rates of the two decays , one can determine whether the type - ii 2hdm or the l2hdm is the right theory . on the other hand , if both decays are observed with small rates or fail to be observed , the singlet extensions of the mssm are favored . in fig.[fig7 ] , we show the rate of @xmath3 as the function of @xmath270 . 
this figure indicates that the branching ratio of @xmath121 can reach @xmath271 , @xmath272 , @xmath273 and @xmath274 for the optimal cases of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively , which implies that the decay @xmath121 will never be observable at the gigaz if the studied model is chosen by nature . the reason for the smallness is , as we pointed out before , that the decay @xmath121 proceeds only at loop level . comparing the optimum cases of the type - ii 2hdm , the nmssm and the nmssm shown in fig.5 - 7 , one may find that the relation @xmath275 holds for any of the decays . this is because the decays are all induced by the yukawa couplings with similar structure for the models . in the supersymmetric models , the large singlet component of the light @xmath0 is to suppress the yukawa couplings , and the @xmath0 in the nmssm has more singlet component than that in the nmssm . next we consider the decay @xmath11 , which , unlike the above decays , depends on the higgs self interactions . in fig.[fig8 ] we plot its rate as a function of @xmath270 and this figure indicates that the @xmath276 may be the largest among the ratios of the exotic @xmath1 decays , reaching @xmath277 in the optimum cases of the type - ii 2hdm , the l2hdm and the nmssm . the underlying reason is , in some cases , the intermediate state @xmath119 in fig.[fig3 ] ( a ) may be on - shell . in fact , we find this is one of the main differences between the nmssm and the nmssm , that is , in the nmssm , @xmath119 in fig.[fig3 ] ( a ) may be on - shell ( corresponds to the points with large @xmath278 ) while in the nmssm , this seems impossible . so we conclude that the decay @xmath11 may serve as an alternative channel to test new physics models , especially it may be used to distinguish the nmssm from the nmssm if the supersymmetry is found at the lhc and the @xmath11 is observed at the gigaz with large rate . before we end our discussion , we note that in the nmssm , the higgs boson @xmath0 may be lighter than @xmath279 without conflicting with low energy data from @xmath178 decays and the other observables ( see fig.[fig4]-[fig8 ] ) . in this case , @xmath0 is axion - like as pointed out in @xcite . we checked that , among the rare @xmath1 decays discussed in this paper , the largest branching ratio comes from @xmath280 which can reach @xmath281 . since in this case , the decay product of @xmath0 is highly collinear muon pair , detecting the decay @xmath280 may need some knowledge about detectors , which is beyond our discussion . in this paper , we studied the rare @xmath1-decays @xmath2 ( @xmath7 ) , @xmath282 and @xmath4 in the type - ii 2hdm , lepton - specific 2hdm , nmssm and nmssm , which predict a light cp - odd higgs boson @xmath0 . in the parameter space allowed by current experiments , the branching ratio can be as large as @xmath5 for @xmath118 , @xmath8 for @xmath3 and @xmath9 for @xmath4 , which implies that the decays @xmath2 and @xmath283 may be accessible at the gigaz option . since different models predict different size of branching ratios , these decays can be used to distinguish different model through the measurement of these rare decays . this work was supported in part by hastit under grant no . 2009hastit004 , by the national natural science foundation of china ( nnsfc ) under grant nos . 10821504 , 10725526 , 10635030 , 10775039 , 11075045 and by the project of knowledge innovation program ( pkip ) of chinese academy of sciences under grant no . . 
for some reviews , see , e.g. , m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod . a * 19 * , 159 ( 2004 ) ; j. m. yang , arxiv:1006.2594 . j. i. illana , m. masip , 67 , 035004 ( 2003 ) ; j. cao , z. xiong , j. m. yang , 32 , 245 ( 2004 ) . d. atwood _ et al_. , 66 , 093005 ( 2002 ) . j. kalinowski , and s. pokorski , 219 , 116 ( 1989 ) ; a. djouadi , p. m. zerwas and j. zunft , 259 , 175 ( 1991 ) ; a. djouadi , j. kalinowski , and p. m. zerwas , z. phys . c * 54 * , 255 ( 1992 ) . m. krawczyk , _ et al . _ , 19 , 463 ( 2001 ) ; 8 , 495 ( 1999 ) . j. f. gunion , g. gamberini and s. f. novaes , 38 , 3481 ( 1988 ) ; thomas j. weiler and tzu - chiang yuan , 318 , 337 ( 1989 ) ; a. djouadi , _ et al . _ , 1 , 163 ( 1998)[hep - ph/9701342 ] . d. chang and w. y. keung , phys . lett . * 77 * , 3732 ( 1996 ) . e. keith and e. ma , 57 , 2017 ( 1998 ) ; m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod.phys . a * 19 * , 159 ( 2004 ) . f. larios , g. tavares - velasco and c. p. yuan , 64 , 055004 ( 2001 ) ; 66 , 075006 ( 2002 ) . a. djouadi , _ et al . _ , 10 , 27 ( 1999 ) [ hep - ph/9903229 ] . for a detailed introduction of the nmssm , see f. franke and h. fraas , int . j. mod . a * 12 * ( 1997 ) 479 ; for a recent review of the nmssm , see for example , u. ellwanger , c. hugonie , and a. m. teixeira , arxiv : 0910.1785 . see , e.g. , j. r. ellis , j. f. gunion , h. e. haber , l. roszkowski and f. zwirner , phys . rev . d * 39 * ( 1989 ) 844 ; m. drees , int . j. mod . phys . a * 4 * ( 1989 ) 3635 ; u. ellwanger , m. rausch de traubenberg and c. a. savoy , phys . b * 315 * ( 1993 ) 331 ; nucl . b * 492 * ( 1997 ) 21 ; d.j . miller , r. nevzorov , p.m. zerwas , 681 , 3 ( 2004 ) . c. panagiotakopoulos , k. tamvakis , 446 , 224 ( 1999 ) ; 469 , 145 ( 1999 ) ; c. panagiotakopoulos , a. pilaftsis , 63 , 055003 ( 2001 ) ; a. dedes , _ et al . _ , 63 , 055009 ( 2001 ) ; a. menon , _ et al . _ , 70 , 035005 ( 2004 ) ; v. barger , _ et al . _ , 630 , 85 ( 2005 ) . c. balazs , _ et al . _ , 0706 , 066 ( 2007 ) . b. a. dobrescu , k. t. matchev , 0009 , 031 ( 2000 ) ; a. arhrib , k. cheung , t. j. hou , k. w. song , hep - ph/0611211 ; 0703 , 073 ( 2007 ) ; x. g. he , j. tandean , and g. valencia , 98 , 081802 ( 2007 ) ; 0806 , 002 ( 2008 ) ; f. domingo _ et al_. , 0901 , 061 ( 2009 ) ; gudrun hiller , 70 , 034018 ( 2004 ) ; r. dermisek , and john f. gunion , 75 , 075019 ( 2007 ) ; 79 , 055014 ( 2009 ) ; 81 , 055001 ( 2010 ) ; r. dermisek , john f. gunion , and b. mcelrath , 76 , 051105 ( 2007 ) ; z. heng , _ et al_. , 77 , 095012 ( 2008 ) ; a. belyaev _ et al_. , 81 , 075021 ( 2010 ) ; d. das and u. ellwanger , arxiv:1007.1151 [ hep - ph ] . s. andreas , o. lebedev , s. ramos - sanchez and a. ringwald , arxiv:1005.3978 [ hep - ph ] . j. f. gunion , jhep * 0908 * , 032 ( 2009 ) ; r. dermisek and j. f. gunion , phys . rev . d * 81 * , 075003 ( 2010 ) . r. dermisek and j. f. gunion , phys . lett . * 95 * , 041801 ( 2005 ) ; phys . d * 73 * , 111701 ( 2006 ) . j. cao , h. e. logan , j. m. yang , 79 , 091701 ( 2009 ) . j. cao , p. wan , l. wu , j. m. yang , 80 , 071701 ( 2009 ) . j. f. gunion and h. e. haber , 67 , 075019 ( 2003 ) . r. m. barnett , _ et al . _ , phys . b * 136 * , 191 ( 1984 ) ; r. m. barnett , g. senjanovic and d. wyler , phys . d * 30 * , 1529 ( 1984 ) ; y. grossman , nucl . b * 426 * , 355 ( 1994 ) . h. s. goh , l. j. hall and p. kumar , jhep * 0905 * , 097 ( 2009 ) ; a. g. akeroyd and w. j. stirling , nucl . b * 447 * , 3 ( 1995 ) ; a. g. akeroyd , phys . 
b * 377 * , 95 ( 1996 ) ; h. e. logan and d. maclennan , phys . rev . d * 79 * , 115022 ( 2009 ) ; m. aoki , _ et al . _ , arxiv:0902.4665 [ hep - ph ] . v. barger , p. langacker , h. s. lee and g. shaughnessy , phys . d * 73 * , 115010 ( 2006 ) . s. hesselbach , _ et . _ , arxiv:0810.0511v2 [ hep - ph ] . de vivie and p. janot [ aleph collaboration ] , pa13 - 027 contribution to the international conference on high energy physics , warsaw , poland , 2531 july 1996 ; j. kurowska , o. grajek and p. zalewski [ delphi collaboration ] , cern - open-99 - 385 . [ aleph collaboration and delphi collaboration and l3 collaboration ] , phys . rept . * 427 * , 257 ( 2006 ) . j. cao and j. m. yang , jhep * 0812 * , 006 ( 2008 ) . m. krawczyk and d. temes , eur . j. c * 44 * , 435 ( 2005 ) . g. altarelli and r. barbieri , 253 , 161 ( 1991 ) ; m. e. peskin , t. takeuchi , 46 , 381 ( 1992 ) . c. amsler , _ et al . _ , ( particle data group ) , 667 , 1 ( 2008 ) . o. deschamps , s. descotes - genon , s. monteil , v. niess , s. tjampens and v. tisserand , arxiv:0907.5135 [ hep - ph ] . s. su and b. thomas , phys . d * 79 * , 095014 ( 2009 ) . g. abbiendi , _ et al . _ , eur . phys . j. c * 32 * , 453 ( 2004 ) . m. davier , _ et al . _ , 66 , 1 ( 2010 ) . k. cheung , _ et al . _ , phys . d * 64 * , 111301 ( 2001 ) . k. cheung and o. c. w. kong , phys . d * 68 * , 053003 ( 2003 ) . t. besmer , c. greub , t.hurth , 609 , 359 ( 2001 ) ; f. borzumati , _ et al . _ , 62 , 075005(2000 ) . j. cao , k. i. hikasa , w. wang , j. m. yang and l. x. yu , phys . d * 82 * , 051701 ( 2010 ) [ arxiv:1006.4811 [ hep - ph ] ] . j. f. gunion , _ et . d * 73 * , 015011 ( 2006 ) . martin and j. d. wells , phys . d * 64 * , 035003 ( 2001 ) . j. abdallah _ et al . _ , eur . j. c * 31 * , 421 ( 2004 ) ; g. abbiendi _ et al . _ , eur . j. c * 35 * , 1 ( 2004 ) . j. dunkley _ et al . _ [ wmap collaboration ] , astrophys . j. suppl . * 180 * , 306 ( 2009 ) [ arxiv:0803.0586 [ astro - ph ] ] . u. ellwanger _ et al . _ , 02 , 066 ( 2005 ) . g. belanger , f. boudjema , a. pukhov and a. semenov , comput . commun . * 174 * , 577 ( 2006 ) ; comput . phys . commun . * 176 * , 367 ( 2007 ) . g. belanger , f. boudjema , c. hugonie , a. pukhov and a. semenov , jcap * 0509 * , 001 ( 2005 ) .""" - - ARTICLE_MAGNET = r"""it is well known that the classical magnetoresistance ( mr ) in metals or semiconductors with a closed free electron fermi surface increases quadratically with increasing magnetic field @xmath2 for @xmath3 and saturates when @xmath4 . here @xmath5 is the zero - magnetic - field mobility . hence , the extraordinarily high and linear mr ( lmr ) , which breaks this familiar rule , has been gaining much attention as soon as its discovery . in the past decade , this unexpected lmr has been reported in silver chalcogenide,@xcite indium antimonide,@xcite silicon,@xcite mnas - gaas composite material,@xcite and graphene.@xcite kapitza s linear law@xcite indicates that the metal shows a magnetoresistance linear in perpendicular magnetic field when it has an open fermi surface and a mean free path longer than the electronic larmor radius . recently , another two models , irrespective of the open fermi surface , have been constructed to provide possible mechanisms for the lmr phenomenon . 
abrikosov suggested a quantum - limit origin of lmr for the homogenous system with a gapless linear energy spectrum.@xcite his model requires that landau levels are well formed and the carrier concentration is small that all electrons occupy only the lowest landau band . alternatively , parish and littlewood developed a classical model without involving linear spectrum.@xcite ignoring the concrete microscopic mechanism , they attributed this unusual mr to the mobility fluctuations in a strongly inhomogenous system . topological insulators@xcite ( tis ) are novel materials with a full energy gap in bulk , while there are gapless surface states . due to its unique band structure with only one helical dirac cone and linear energy dispersion,@xcite the surface states of the ti bi@xmath0se@xmath1 become an excellent platform for the study of quantum - limit lmr . the recent experiment in this flat surface system , however , reported that a large positive mr , which becomes very linear above a characteristic field of @xmath6@xmath7@xmath8 t , was observed even in an opposite situation where the carrier sheet density is high that electrons occupy more than one landau levels.@xcite moreover , they found that raising temperature to room temperature almost has no influence on the observed lmr . it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model . so far a reliable theoretical scheme capable of explaining this novel experiment has still been lacking . in this paper , we generalize the balance - equation approach@xcite to a system modeling the surface states of a three - dimensional ti to investigate the two - dimensional magnetotransport in it . we find that a positive , nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the ti surface state having a positive and finite effective g - factor . this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels , and persists up to room temperature , providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite we consider the surface state of a bi@xmath0se@xmath1-type large bulk gap ti in the @xmath9-@xmath10 plane under the influence of a uniform magnetic field @xmath11 applied along the @xmath12 direction.@xcite following the experimental observation,@xcite we assume that the fermi energy locates in the gap of the bulk band and above the dirac point , i.e. the surface carriers are electrons . further , the separations of the fermi energy from the bottom of bulk band and dirac point are much larger than the highest temperature ( @xmath13 ) considered in this work . hence , the contribution from the bulk band to the magnetotransport is negligible . these electrons , scattered by randomly distributed impurities and by phonons , are driven by a uniform in - plane electric field @xmath14 in the topological surface . 
the hamiltonian of this many - electron and phonon system consists of an electron part @xmath15 , a phonon part @xmath16 , and electron - impurity and electron - phonon interactions @xmath17 and @xmath18 : @xmath19 here , the electron hamiltonian is taken in the form @xmath20 , \ ] ] in which @xmath21 , @xmath22 , @xmath23 and @xmath24 , stand , respectively , for the canonical momentum , coordinate , momentum and spin operators of the @xmath25th electron having charge @xmath26 , @xmath27 is the vector potential of the perpendicular magnetic field @xmath28 in the landau gauge , @xmath29 is the fermi velocity , @xmath30 is the effective g - factor of the surface electron , and @xmath31 is the bohr magneton with @xmath32 the free electron mass . the sum index @xmath25 in eq.([helectron ] ) goes over all electrons of total number @xmath33 in the surface state of unit area . in the frame work of balance equation approach,@xcite the two - dimensional center - of - mass ( c.m . ) momentum and coordinate @xmath34 and @xmath35 , and the relative - electron momenta and coordinates @xmath36 and @xmath37 are introduced to write the hamiltonian @xmath15 into the sum of a single - particle c.m . part @xmath38 and a many - particle relative - electron part @xmath39 : @xmath40 , with @xmath41.\end{aligned}\ ] ] in this , @xmath42 is the canonical momentum of the center - of - mass and @xmath43 is the canonical momentum for the @xmath25th relative electron . here we have also introduced c.m . spin operators @xmath44 and @xmath45 . the commutation relations between the c.m . spin operators @xmath46 and @xmath47 and the spin operators @xmath48 , @xmath49 and @xmath50 of the @xmath25th electron are of order of @xmath51 : @xmath52= n^{-1}2\,{\rm i}\,\varepsi lon_{\beta_1\beta_2\beta_3}\sigma_j^{\beta_3}$ ] with @xmath53 . therefore , for a macroscopic large @xmath33 system , the c.m . part @xmath38 actually commutes with the relative - electron part @xmath54 in the hamiltonian , i.e. the c.m . motion and the relative motion of electrons are truly separated from each other . the couplings between the two emerge only through the electron impurity and electron phonon interactions . furthermore , the electric field @xmath55 shows up only in @xmath38 . and , in view of @xmath56={\rm i}\delta_{\alpha \beta}(\delta_{ij}-1/n)\simeq { \rm i}\delta_{\alpha\beta}\delta_{ij}$ ] , i.e. the relative - electron momenta and coordinates can be treated as canonical conjugate variables , the relative - motion part @xmath54 is just the hamiltonian of @xmath33 electrons in the surface state of ti in the magnetic field without the presence of the electric field . in terms of the c.m . coordinate @xmath57 and the relative electron density operator @xmath58 , the electron impurity and electron phonon interactions can be written as@xcite @xmath59 here @xmath60 and @xmath61 are respectively the impurity potential ( an impurity at randomly distributed position @xmath62 ) and electron phonon coupling matrix element in the plane - wave representation , and @xmath63 with @xmath64 and @xmath65 being the creation and annihilation operators for a phonon of wavevector @xmath66 in branch @xmath67 having frequency @xmath68 . velocity ( operator ) @xmath69 is the time variation of its coordinate : @xmath70= v_{\rm f}(\sigma_{\rm c}^y\ , \hat{i}-\sigma_{\rm c}^x\ , \hat{j})$ ] . to derive a force - balance equation for steady state transport we consider the heisenberg equation for the rate of change of the c.m . 
canonical momentum @xmath71 : @xmath72= - n e({\bm v}\times { \bm b})- n e{\bm e}+{\bm { f}}_{\rm i}+{\bm { f}}_{\rm p},\ ] ] in which the frictional forces @xmath73 and @xmath74 share the same expressions as given in ref .. the statistical average of the operator equation can be determined to linear order in the electron impurity and electron phonon interactions @xmath17 and @xmath18 with the initial density matrix @xmath75 at temperature @xmath76 when the in - plane electric field @xmath77 is not strong . for steady - transport states we have @xmath78 , leading to a force - balance equation of the form @xmath79 here @xmath80 , the statistically averaged velocity of the moving center - of - mass , is identified as the average rate of change of its position , i.e. the drift velocity of the electron system driven by the electric field @xmath77 , and @xmath81 and @xmath82 are frictional forces experienced by the center - of - mass due to impurity and phonon scatterings : @xmath83,\label{fp}\end{aligned}\ ] ] in which @xmath84 is the bose distribution function , @xmath85 , and @xmath86 stands for the imaginary part of the fourier spectrum of the relative - electron density correlation function defined by @xmath87\big\rangle_{0},\ ] ] where @xmath88 and @xmath89 denotes the statistical averaging over the initial density matrix @xmath90.@xcite the force - balance equation describes the steady - state two - dimensional magnetotransport in the surface state of a ti . note that the frictional forces @xmath81 and @xmath82 are in the opposite direction of the drift velocity @xmath91 and their magnitudes are functions of @xmath92 only . with the drift velocity @xmath93 in the @xmath9 direction , the force - balance equation eq . yields a transverse resistivity @xmath94 , and a longitudinal resistivity @xmath95 . the linear one is in the form @xmath96 for calculating the electron density correlation function @xmath97 we proceed in the landau representation.@xcite the landau levels of the single - particle hamiltonian @xmath98 of the relative - electron system in the absence of electric field are composed of a positive `` @xmath99 '' and a negative `` @xmath100 '' branch@xcite @xmath101 with @xmath102 and @xmath103 , and a zero ( @xmath104 ) level @xmath105 the corresponding landau wave functions are @xmath106 and @xmath107 for @xmath108 ; and @xmath109 for @xmath104 . here @xmath110 is the wavevector of the system along @xmath9 direction ; @xmath111 with @xmath112 ; and @xmath113 is the harmonic oscillator eigenfunction with @xmath114 being the hermite polynomial , @xmath115 , and @xmath116 . each landau level contains @xmath117 electron states for system of unit surface area . the positive branch @xmath118 and the @xmath104 level @xmath119 of the above energy spectra are indeed quite close to those of the surface states in the bulk gap of bi@xmath0se@xmath1-family materials derived from microscopic band calculation.@xcite the landau levels are broadened due to impurity , phonon and electron - electron scatterings . we model the imaginary part of the retarded green s function , or the density - of - states , of the broadened landau level @xmath120 ( written for `` + ' ' -branch and @xmath104 levels ) , using a gaussian - type form:@xcite @xmath121,\ ] ] with a half - width @xmath122 of the form:@xcite @xmath123^{1/2}$ ] . 
here @xmath124 is the single - particle lifetime and @xmath125 is the cyclotron frequency of the linear - energy - dispersion system with @xmath126 being the zero - temperature fermi level . using a semi - empirical parameter @xmath127 to relate @xmath124 with the transport scattering time @xmath128 , and expressing @xmath129 with the zero - field mobility @xmath5 at finite temperature,@xcite we can write the landau - level broadening as @xmath130^{1/2}.\ ] ] in the present study we consider the case of @xmath120-doping , i.e. the fermi level is high enough above the energy zero of the dirac cone in the range of `` + '' -branch levels and the states of the `` @xmath100 '' -branch levels are completely filled , so that they are irrelevant to electron transport . special attention has to be paid to the @xmath104 level , since , depending on the direction of the exchange potential , the effective g - factor of a ti surface state , @xmath30 , can be positive , zero or negative.@xcite the sign and magnitude of the effective g - factor determine how many states of the zero level should be included in or excluded from the available states for electron occupation in the case of @xmath120-doping at a magnetic field . ( i ) if @xmath131 , the @xmath104 level center is exactly at @xmath132 and the system is electron - hole symmetric . the total number of negative - energy states ( including the states of the lower half of the @xmath104 level and the states of the `` @xmath100 '' -branch levels ) and that of positive - energy states ( including the states of the upper half of the @xmath104 level and the states of the `` @xmath99 '' -branch levels ) do not change when the magnetic field changes . therefore , the lower - half negative - energy states of this level are always filled and the upper - half positive - energy states of it are available for the occupation of particles which are counted as electrons participating in transport in the case of @xmath120-doping . ( ii ) for a finite positive @xmath133 , the @xmath104 level @xmath134 moves downward to negative energy and its distance to the nearest `` @xmath100 '' -branch level is @xmath135 closer than that to the nearest `` + '' -branch level at finite magnetic field strength @xmath2 . this is equivalent to the opening of an increasingly enlarged ( with increasing @xmath2 ) energy gap between the `` + '' -branch states and the states of the zero level and the `` @xmath100 '' -branch levels . the opening of a sufficient energy gap implies that with increasing magnetic field the states in the `` + '' -branch levels would no longer shrink into the zero level , and thus the @xmath104 level should be completely excluded from the conduction band , i.e. only particles occupying the `` + '' -branch states are counted as electrons participating in transport in the case of @xmath120-doping , when the magnetic field @xmath2 gets larger than a certain value ( depending on the magnitude of @xmath30 ) . ( iii ) for a finite negative @xmath136 , the @xmath104 level @xmath134 moves upward to positive energy and an increasingly enlarged energy gap will be opened between the states of the zero level and the `` + '' -branch and the states of the `` @xmath100 '' -branch levels , and particles occupying the @xmath104 level and the `` + '' -branch states are electrons participating in transport when the magnetic field @xmath2 gets larger than a certain value . 
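for orientation , the standard landau spectrum of a two - dimensional dirac cone with a zeeman term ( a textbook form consistent with the branch structure described above ; the original expressions are hidden behind the @xmath placeholders and may differ in convention ) reads

\[
\varepsilon_{n}^{\pm} = \pm\sqrt{\,2 n \hbar e B v_{\rm F}^{2} + \Big(\tfrac{1}{2} g \mu_{\rm B} B\Big)^{2}\,}\,,\qquad n = 1, 2, \dots ,\qquad
\varepsilon_{0} = -\tfrac{1}{2}\, g \mu_{\rm B} B .
\]

in this form a positive g pushes the zero level downward toward the `` @xmath100 '' branch while a negative g pushes it upward toward the `` + '' branch , which is exactly the behavior invoked in cases ( ii ) and ( iii ) above .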
as a result , the experimentally accessible sheet density @xmath33 of electrons participating in transport is related to the fermi energy @xmath137 by the following equation valid at finite @xmath30 for the magnetic field @xmath2 larger than a certain value : @xmath138 in which @xmath139 + 1\}^{-1}$ ] is the fermi distribution function at temperature @xmath76 and the summation index @xmath120 goes over @xmath140 for @xmath133 , or @xmath141 for @xmath136 . in the case of @xmath131 , @xmath142\ ] ] valid for arbitrary magnetic field , in which @xmath143 . the imaginary part of the relative - electron density correlation function in the presence of a magnetic field , @xmath86 , can be expressed in the landau representation as@xcite @xmath144 in which the transform factor @xmath145 ^ 2,\end{aligned}\ ] ] with @xmath146 , @xmath147 , @xmath148 , and @xmath149 being associated laguerre polynomials . the landau - representation correlation function @xmath150 in eq.([piqw ] ) can be constructed with the imaginary part of the retarded green s function @xmath151 , or the density - of - states , of the @xmath120th landau level as@xcite @xmath152\nonumber\\ & \hspace{1.2cm}\times{\rm im}g_n(\epsilon+\omega){\rm im}g_{n'}(\epsilon).\end{aligned}\ ] ] the summation indices @xmath120 and @xmath153 in eq.([piqw ] ) are taken over @xmath140 for @xmath133 , or @xmath154 for @xmath136 . in the case of @xmath131 , eq.([piqw ] ) still works and the summation indices @xmath120 and @xmath153 go over @xmath154 but with @xmath155 replaced by @xmath156 in eq.([p2nn ] ) . numerical calculations are performed for the magnetoresistivity @xmath157 of the surface state in a uniform ti bi@xmath0se@xmath1 . at zero temperature the elastic scattering contributing to the resistivity is modeled by a coulomb potential due to charged impurities:@xcite @xmath158 with @xmath159 being the impurity density , which is determined by the zero - magnetic - field mobility @xmath5 . at temperatures higher than @xmath160,@xcite phonon scatterings play an increasingly important role and the dominant inelastic contribution comes from optical phonons . for this polar material , the scattering by optical phonons via the deformation potential can be neglected . hence , we take into account the inelastic scattering from optical phonons via the fröhlich coupling : @xmath161 . in the numerical calculation we use the following parameters:@xcite fermi velocity @xmath162 , static dielectric constant @xmath163 , optical dielectric constant @xmath164 , and phonon energy @xmath165 . the broadening parameter is taken to be @xmath166 . [ figure fig.[diffg ] caption : the magnetoresistivity as a function of the magnetic field @xmath2 for different effective g - factors @xmath167 and @xmath168 , for a ti surface system with electron sheet density @xmath169 in the cases of zero - magnetic - field mobility @xmath170 ( a ) and @xmath171 ( b ) ; several integer - number positions of the filling factor @xmath172 are marked in ( b ) . ] fig.[diffg ] shows the calculated magnetoresistivity @xmath157 versus the magnetic field strength @xmath2 for a ti surface system with electron sheet density @xmath169 but having different effective g - factors : @xmath167 and @xmath168 for two values of zero - magnetic - field mobility @xmath170 and @xmath171 , representing different degrees of landau - level broadening . 
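the level counting described above is easy to sketch numerically . the following short python snippet is an illustration written for this note , not code from the paper : the fermi velocity and g - factor values are arbitrary example numbers , and the only physical input is that each landau level of a single dirac cone holds eb / h states per unit area and that only `` + '' -branch levels are summed ( the g > 0 case ) .

import numpy as np

E = 1.602176634e-19        # elementary charge , C
HBAR = 1.054571817e-34     # reduced planck constant , J s
H = 2.0 * np.pi * HBAR     # planck constant , J s
MU_B = 9.2740100783e-24    # bohr magneton , J / T
K_B = 1.380649e-23         # boltzmann constant , J / K

def plus_branch_levels(b_field, v_f, g_factor, n_max=300):
    # energies ( J ) of the "+" - branch landau levels of a dirac cone with a zeeman term
    n = np.arange(1, n_max + 1)
    zeeman = 0.5 * g_factor * MU_B * b_field
    return np.sqrt(2.0 * n * HBAR * E * b_field * v_f**2 + zeeman**2)

def transport_sheet_density(e_fermi, b_field, temperature, v_f=5.0e5, g_factor=20.0, n_max=300):
    # sheet density ( m^-2 ) of electrons occupying "+" - branch levels :
    # each level holds e * B / h states per unit area for a single dirac cone
    levels = plus_branch_levels(b_field, v_f, g_factor, n_max)
    occupation = 1.0 / (np.exp((levels - e_fermi) / (K_B * temperature)) + 1.0)
    return (E * b_field / H) * occupation.sum()

# example : fermi level 0.1 eV above the dirac point , B = 6 T , T = 300 K
print(transport_sheet_density(0.1 * E, 6.0, 300.0))

inverting this relation for a fixed sheet density ( e.g. with a root finder ) gives the fermi level that enters the resistivity calculation .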
in the case without zeeman splitting ( @xmath131 ) the resistivity @xmath157 exhibits almost no change with changing magnetic field up to 10 t , except for the shubnikov - de haas ( sdh ) oscillation showing up in the case of @xmath171 . this kind of magnetoresistance behavior was indeed seen experimentally in the electron - hole symmetrical massless system of single - layer graphene.@xcite in the case of a positive g - factor , @xmath173 , the magnetoresistivity increases linearly with increasing magnetic field ; while for a negative g - factor , @xmath174 , the magnetoresistivity decreases linearly with increasing magnetic field . [ figure fig.[rhob ] caption : the resistivity is shown as a function of the magnetic field @xmath2 for different values of zero - magnetic - field mobility : ( a ) @xmath175 , ( b ) @xmath176 , ( c ) @xmath177 , ( d ) @xmath178 , ( e ) @xmath179 , and ( f ) @xmath180 . the inset of ( a ) illustrates the same for a larger magnetic - field range @xmath181 . the filling factor @xmath182 is plotted versus the magnetic field in ( f ) ; and several integer - number positions of @xmath182 are also marked in ( d ) and ( e ) . here the surface electron density is @xmath169 and the lattice temperature is @xmath183 . ] in the following we will give a more detailed examination of the linearly increasing magnetoresistance in the positive @xmath30 case . fig.[rhob ] shows the calculated resistivity @xmath157 versus the magnetic field strength @xmath2 at lattice temperature @xmath183 for a system of carrier sheet density @xmath169 and @xmath173 , having different zero - field mobilities @xmath184 and @xmath180 . all resistivity curves for mobility @xmath185 exhibit clear linearity in the magnetic - field range and show no tendency of saturation at the highest field shown in the figure . in particular , for the case @xmath170 , the linear behavior extends even up to the magnetic field of @xmath186 , as illustrated in the inset of fig.[rhob](a ) . this feature contradicts the classical mr , which saturates at sufficiently large magnetic field @xmath187 . note that here we only present the calculated @xmath157 for magnetic field @xmath2 larger than @xmath188 t , for which a sufficient energy gap @xmath135 is assumed to open , so that with further increase of the magnetic field the states in the `` + '' -branch levels no longer shrink into the zero level and thus the zero level should be excluded from the conduction band . this is of course not true for a very weak magnetic field . when @xmath189 the energy gap @xmath190 , the situation becomes similar to the case of @xmath131 : the whole upper half of the zero - level states are available for electron occupation and we should have a flat resistivity @xmath157 when changing the magnetic field . with increasing @xmath2 the portion of the zero - level states available to conduction electrons decreases until the magnetic field reaches @xmath191 . as a result the resistivity @xmath157 should exhibit a crossover from a flat dependence at small @xmath2 to a positive linear increase at @xmath192 . this is just the behavior observed in the ti bi@xmath0se@xmath1.@xcite note that in the case of @xmath170 , the broadened landau - level widths are always larger than the neighboring level interval : @xmath193 , which requires @xmath194 ^ 2 $ ] , even for the lowest landau level @xmath195 , i.e. the whole landau - level spectrum is smeared . 
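for comparison , the field scale separating the weak - field and strong - field regimes of ordinary classical transport is set by the cyclotron parameter , and classical magnetoresistance generically saturates once

\[
\omega_{c}\tau = \mu B \gtrsim 1
\]

( apart from special cases such as compensated metals or open orbits ) . this is quoted here only as the general textbook criterion ; the precise condition referred to in the text is hidden behind the @xmath187 placeholder .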
with increasing zero - field mobility the magnitude of the resistivity @xmath157 decreases , and when the broadened landau - level width becomes smaller than the neighboring level interval , @xmath196 , a weak sdh oscillation begins to occur around the linearly - dependent average value of @xmath157 in the higher portion of the magnetic - field range , as seen in fig.[rhob](c ) , ( d ) and ( e ) for @xmath197 and @xmath198 . on the other hand , in the case of large mobility , e.g. @xmath199 , where the broadened landau - level widths @xmath200 are much smaller than the neighboring level interval even for level index @xmath120 as large as @xmath201 , the magnetoresistivity shows a pronounced sdh oscillation and the linear behavior disappears before the appearance of the quantum hall effect,@xcite as shown in fig.[rhob](f ) . abrikosov s model for the lmr requires the applied magnetic field to be large enough to reach the quantum limit at which all the carriers are within the lowest landau level,@xcite while it is obvious that more than one landau level is occupied in the experimental samples in the field range in which the linear and non - saturating magnetoresistivity was observed.@xcite for the given electron surface density @xmath202 , the number of occupied landau levels , or the filling factor @xmath172 , at different magnetic fields is shown in fig.[rhob](f ) , as well as in fig.[rhob](d ) and ( e ) , where the integer - number positions of @xmath203 , i.e. the filling of @xmath182 entire landau levels , coincide with the minima of the density - of - states or the dips of the sdh oscillation . this is in contrast with the @xmath131 case , where an integer value of @xmath203 , which implies filling up to the center position of the @xmath182th landau level , is located at a peak of the sdh oscillation , as shown in fig.[diffg]b . the observed sdh oscillations in the bi@xmath0se@xmath1 nanoribbon exhibiting nonsaturating surface lmr in the experiment@xcite favor the former case : a finite positive effective @xmath133 . [ figure fig.[rhon ] caption : the magnetoresistivity is plotted as a function of the surface electron density @xmath33 at magnetic field @xmath204 : ( a ) at different values of zero - field mobility @xmath5 , and ( b ) at different values of zero - field conductivity @xmath205 . ] [ figure fig.[rhot ] caption : the magnetoresistivity versus the magnetic field at various lattice temperatures ; the zero - magnetic - field mobility at zero temperature is @xmath206 . ] next , we examine the density dependence of the linear magnetoresistivity . to compare with abrikosov s quantum magnetoresistance , which suggests a @xmath207 behavior,@xcite we show the calculated @xmath208 for the above lmr versus the carrier sheet density @xmath33 in fig.[rhon ] at fixed magnetic field @xmath209 t . the mobility is taken to be @xmath210 and @xmath211m@xmath212/vs , respectively , to keep the resistivity in the lmr regime . a clearly linear dependence of @xmath213 on the surface density @xmath33 is seen in all cases , indicating that this non - saturating linear resistivity is almost inversely proportional to the carrier density . in the figure we also show @xmath208 versus @xmath33 under the condition of different given conductivities @xmath214 and @xmath215 . in this case the half - width @xmath216 is independent of the surface density . the linear dependence still holds , indicating that this linear behavior is not sensitive to the modest @xmath33-dependence of the landau - level broadening @xmath216 as long as the system is in the overlapped landau level regime . 
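the filling factor quoted above follows directly from the level degeneracy of a single dirac cone , eb / h states per unit area per landau level :

\[
\nu = \frac{n_{s}}{\,eB/h\,} = \frac{n_{s}\, h}{e B}\, ,
\]

where \( \nu \) stands for the filling factor @xmath172 and \( n_{s} \) for the sheet density @xmath33 . as a rough worked example ( the density actually used in the calculation is hidden behind the @xmath169 placeholder ; the number below is hypothetical ) , a sheet density of 10^{13} cm^{-2} corresponds to eb / h ≈ 2.4 × 10^{10} cm^{-2} per tesla , hence ν ≈ 410 at 1 t and ν ≈ 41 at 10 t , i.e. many occupied landau levels throughout the field range considered here .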
from the above discussion , it is obvious that lmr shows up in the system having overlapped landau levels , and the separation of landau levels makes the mr depart from the linear increase . at high temperature , the thermal energy would smear the level separation and phonon scatterings further broaden the landau levels . hence , it is believed that this lmr will be robust against raising temperature . this is indeed the case as seen in fig.[rhot ] , where we plot the calculated magnetoresistivity @xmath157 for the above system with zero - temperature linear mobility @xmath217m@xmath212/vs versus the magnetic field at different lattice temperatures . we can see that raising the temperature to room temperature has little effect on the linearity of the mr . due to the decreased mobility at higher temperature from phonon scattering , the weak sdh oscillation on the linear background tends to vanish . these features are in good agreement with the experimental report.@xcite in summary , we have studied the two - dimensional magnetotransport in the flat surface of a three - dimensional ti , which arises from the surface states with a wavevector - linear energy dispersion and a finite , positive zeeman splitting within the bulk energy gap . when the level broadening is comparable to or larger than the landau - level separation and the conduction electrons spread over many landau levels , a positive , dominantly linear and non - saturating magnetoresistance appears within a quite wide range of magnetic field and persists up to room temperature . this remarkable lmr provides a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite in contrast to the quantum hall effect , which appears in the case of well - formed landau levels , and to abrikosov s quantum magnetotransport,@xcite which is limited to the extreme quantum limit in which all electrons coalesce into the lowest landau level , the discussed lmr is a phenomenon of purely classical two - dimensional magnetotransport in a system having a linear energy dispersion , appearing in the regime of overlapped landau levels , irrespective of its showing up in a relatively high magnetic - field range . furthermore , the present scheme deals with the spatially uniform case without invoking the mobility fluctuation in a strongly inhomogeneous system , which is required in the classical parish and littlewood model to produce an lmr.@xcite the appearance of this significant positive - increasing linear magnetoresistance depends on the existence of a positive and sizable effective g - factor . if the zeeman energy splitting is quite small , the resistivity @xmath157 would exhibit little change with changing magnetic field . in the case of a negative and sizable effective g - factor the magnetoresistivity would decrease linearly with increasing magnetic field . therefore , the behavior of the longitudinal resistivity versus magnetic field may provide a useful way of judging the direction and the size of the effective zeeman energy splitting in ti surface states . this work was supported by the national science foundation of china ( grant no . 11104002 ) , the national basic research program of china ( grant no . 2012cb927403 ) and by the program for science & technology innovation talents in universities of henan province ( grant no . 
2012hastit029 ) .""" - - inputs = tokenizer( - [ARTICLE_LEP, ARTICLE_MAGNET], - max_length=1024, - padding="max_length", - truncation=True, - return_tensors="ms", - ) - inputs = {k: inputs[k] for k in inputs} - - hypotheses_batch = model.generate(**inputs) - - EXPECTED_LEP = ( - "we study the rare decays @xmath0 ( @xmath1 ) at the gigaz option of the international linear collider " - "( ilc ). we calculate the branching ratios of @xmath2 in the two higgs doublet model ( 2hdm ), the " - "minimal supersymmetric standard model ( mssm ), the next - to - minimal supersymmetric standard model " - "( nmssm ) and the nearly minimal supersymmetric standard model ( nmssm ). we find that the branching " - "ratios of @xmath3 can reach @xmath4 in 2hdm, @xmath5 in mssm, @xmath6 in nmssm and @xmath7 in nmssm, " - "while they are much smaller than @xmath8 in 2hdm, @xmath9 in mssm, @xmath10 in nmssm and @xmath11 in " - "nmssm." - ) - - EXPECTED_MAGNET = ( - "we investigate the two - dimensional magnetotransport in the surface state of a topological insulator " - "( ti ). we find that a positive, nonsaturating and dominantly linear magnetoresistance can appear " - "within quite wide magnetic - field range in the ti surface state having a positive and finite effective g " - "- factor. this linear magnetoresistance shows up in the system of high carrier concentration and low " - "mobility when electrons are in extended states and spread over many smeared landau levels, and persists " - "up to room temperature, providing a possible mechanism for the recently observed linear magnetoresistance " - "in topological insulator bi@xmath0se@xmath1 nanoribbons." - ) - - generated = tokenizer.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - - self.assertTrue(generated == [EXPECTED_LEP, EXPECTED_MAGNET]) - - -class BigBirdPegasusStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=7, - d_model=32, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - attention_type="original_full", - use_bias=True, - block_size=16, - num_random_blocks=3, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 
1 - - self.attention_type = attention_type - self.use_bias = use_bias - self.block_size = block_size - self.num_random_blocks = num_random_blocks - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = BigBirdPegasusConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - attention_type=self.attention_type, - use_bias=self.use_bias, - block_size=self.block_size, - num_random_blocks=self.num_random_blocks, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = BigBirdPegasusDecoder(config=config).eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = BigBirdPegasusDecoder(config=config).eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = 
random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - # big bird has extremely high logits which requires - # such a high error tolerance here - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=5e-1) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, lm_labels = config_and_inputs - - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class BigBirdPegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BigBirdPegasusDecoder, BigBirdPegasusForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (BigBirdPegasusForCausalLM,) if is_mindspore_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = BigBirdPegasusStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=BigBirdPegasusConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - @unittest.skip("Decoder cannot retain gradients") - def test_retain_grad_hidden_states_attentions(self): - return \ No newline at end of file diff --git a/tests/transformers/models/biogpt/__init__.py b/tests/transformers/models/biogpt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/biogpt/test_modeling_biogpt.py b/tests/transformers/models/biogpt/test_modeling_biogpt.py deleted file mode 100644 index 2661f43fa..000000000 --- a/tests/transformers/models/biogpt/test_modeling_biogpt.py +++ /dev/null @@ -1,455 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore BioGPT model.""" - -import math -import unittest - -from mindnlp.transformers import BioGptConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - BioGptForCausalLM, - BioGptForSequenceClassification, - BioGptForTokenClassification, - BioGptModel, - BioGptTokenizer, - ) - - -class BioGptModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return BioGptConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def 
create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = BioGptModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = BioGptForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_biogpt_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = BioGptModel(config=config) - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_biogpt_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = BioGptModel(config=config).eval() - - attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - 
"last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = BioGptForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def create_and_check_biogpt_weight_initialization(self, config, *args): - model = BioGptModel(config) - model_std = model.config.initializer_range / math.sqrt(2 * model.config.num_hidden_layers) - for key in model.state_dict().keys(): - if "c_proj" in key and "weight" in key: - self.parent.assertLessEqual(abs(ops.std(model.state_dict()[key]) - model_std), 0.001) - self.parent.assertLessEqual(abs(ops.mean(model.state_dict()[key]) - 0.0), 0.01) - - def create_and_check_biogpt_for_token_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - config.num_labels = self.num_labels - model = BioGptForTokenClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (BioGptModel, BioGptForCausalLM, BioGptForSequenceClassification, BioGptForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (BioGptForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BioGptModel, - "text-classification": BioGptForSequenceClassification, - "text-generation": BioGptForCausalLM, - "token-classification": BioGptForTokenClassification, - "zero-shot": BioGptForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - - def setUp(self): - self.model_tester = BioGptModelTester(self) - self.config_tester = ConfigTester(self, config_class=BioGptConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - 
self.model_tester.create_and_check_model(*config_and_inputs) - - def test_biogpt_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_biogpt_model_attention_mask_past(*config_and_inputs) - - @unittest.skip - def test_biogpt_gradient_checkpointing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - def test_biogpt_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_biogpt_model_past_large_inputs(*config_and_inputs) - - def test_biogpt_weight_initialization(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_biogpt_weight_initialization(*config_and_inputs) - - def test_biogpt_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_biogpt_for_token_classification(*config_and_inputs) - - @slow - def test_batch_generation(self): - model = BioGptForCausalLM.from_pretrained("microsoft/biogpt") - tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt") - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit bigger than a little bit.", - "Today, I have a good idea of how to use the information", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/biogpt" - model = BioGptModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - # Copied from tests.models.opt.test_modeling_opt.OPTModelTest.test_opt_sequence_classification_model with OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common - def test_biogpt_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = 
BioGptForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.opt.test_modeling_opt.OPTModelTest.test_opt_sequence_classification_model_for_multi_label with OPT->BioGpt,opt->biogpt,prepare_config_and_inputs->prepare_config_and_inputs_for_common - def test_biogpt_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = BioGptForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - -@require_mindspore -class BioGptModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_lm_head_model(self): - model = BioGptForCausalLM.from_pretrained("microsoft/biogpt") - input_ids = mindspore.tensor([[2, 4805, 9, 656, 21]]) - output = model(input_ids)[0] - - vocab_size = 42384 - - expected_shape = (1, 5, vocab_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[[-9.5236, -9.8918, 10.4557], [-11.0469, -9.6423, 8.1022], [-8.8664, -7.8826, 5.5325]]] - ) - - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_biogpt_generation(self): - tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt") - model = BioGptForCausalLM.from_pretrained("microsoft/biogpt") - - mindspore.manual_seed(0) - mindspore.set_seed(0) - tokenized = tokenizer("COVID-19 is", return_tensors="ms") - output_ids = model.generate( - **tokenized, - min_length=100, - max_length=1024, - num_beams=5, - early_stopping=True, - ) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - EXPECTED_OUTPUT_STR = ( - "COVID-19 is a global pandemic caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), the" - " causative agent of coronavirus disease 2019 (COVID-19), which has spread to more than 200 countries and" - " territories, including the United States (US), Canada, Australia, New Zealand, the United Kingdom (UK)," - " and the United States of America (USA), as of March 11, 2020, with more than 800,000 confirmed cases and" - " more than 800,000 deaths." - ) - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) \ No newline at end of file diff --git a/tests/transformers/models/bit/__init__.py b/tests/transformers/models/bit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bit/test_modeling_bit.py b/tests/transformers/models/bit/test_modeling_bit.py deleted file mode 100644 index efef45867..000000000 --- a/tests/transformers/models/bit/test_modeling_bit.py +++ /dev/null @@ -1,307 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore Bit model. """ -import unittest -import numpy as np -from mindnlp.transformers import BitConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import nn, ops - - from mindnlp.transformers import BitBackbone, BitForImageClassification, BitImageProcessor, BitModel - - -if is_vision_available(): - from PIL import Image - -class BitModelTester: - def __init__( - self, - parent, - batch_size=3, - image_size=32, - num_channels=3, - embeddings_size=10, - hidden_sizes=[8, 16, 32, 64], - depths=[1, 1, 2, 1], - is_training=True, - use_labels=True, - hidden_act="relu", - num_labels=3, - scope=None, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - num_groups=1, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.embeddings_size = embeddings_size - self.hidden_sizes = hidden_sizes - self.depths = depths - self.is_training = is_training - self.use_labels = use_labels - self.hidden_act = hidden_act - self.num_labels = num_labels - self.scope = scope - self.num_stages = len(hidden_sizes) - self.out_features = out_features - self.out_indices = out_indices - self.num_groups = num_groups - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return BitConfig( - num_channels=self.num_channels, - embeddings_size=self.embeddings_size, - hidden_sizes=self.hidden_sizes, - depths=self.depths, - hidden_act=self.hidden_act, - num_labels=self.num_labels, - out_features=self.out_features, - out_indices=self.out_indices, - num_groups=self.num_groups, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = BitModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = BitForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = BitBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify 
feature maps - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) - - # verify backbone works with out_features=None - config.out_features = None - model = BitBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[-1], 1, 1]) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class BitModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Bit does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (BitModel, BitForImageClassification, BitBackbone) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": BitModel, "image-classification": BitForImageClassification} - if is_mindspore_available() - else {} - ) - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = BitModelTester(self) - self.config_tester = ConfigTester(self, config_class=BitConfig, has_text_modality=False) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="Bit does not output attentions") - def test_attention_outputs(self): - pass - - @unittest.skip(reason="Bit does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Bit does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.cells_and_names(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - ops.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - 
ops.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_stages = self.model_tester.num_stages - self.assertEqual(len(hidden_states), expected_num_stages + 1) - - # Bit's feature maps are of shape (batch_size, num_channels, height, width) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.image_size // 4, self.model_tester.image_size // 4], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - layers_type = ["preactivation", "bottleneck"] - for model_class in self.all_model_classes: - for layer_type in layers_type: - config.layer_type = layer_type - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="Bit does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/bit-50" - model = BitModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class BitModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return BitImageProcessor.from_pretrained("google/bit-50") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = BitForImageClassification.from_pretrained("google/bit-50") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([[-0.6526, -0.5263, -1.4398]]) - print(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - -@require_mindspore -class BitBackboneTest(BackboneTesterMixin, unittest.TestCase): - all_model_classes = (BitBackbone,) if is_mindspore_available() else () - config_class = BitConfig - - has_attentions = False - - def setUp(self): - self.model_tester = BitModelTester(self) diff --git a/tests/transformers/models/blenderbot/__init__.py b/tests/transformers/models/blenderbot/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/blenderbot/test_modeling_blenderbot.py b/tests/transformers/models/blenderbot/test_modeling_blenderbot.py deleted file mode 100644 index af0388253..000000000 --- 
a/tests/transformers/models/blenderbot/test_modeling_blenderbot.py +++ /dev/null @@ -1,568 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Blenderbot model. """ - -import tempfile -import unittest - -import numpy as np -from mindnlp.transformers import BlenderbotConfig -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - is_mindspore_available, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer - from mindnlp.transformers.models.blenderbot.modeling_blenderbot import ( - BlenderbotDecoder, - BlenderbotEncoder, - BlenderbotForCausalLM, - ) - - -def prepare_blenderbot_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class BlenderbotModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=50, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - 
self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - # forcing a certain token to be generated, sets all other tokens to -inf - # if however the token to be generated is already at -inf then it can lead token - # `nan` values and thus break generation - self.forced_bos_token_id = None - self.forced_eos_token_id = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return BlenderbotConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - forced_bos_token_id=self.forced_bos_token_id, - forced_eos_token_id=self.forced_eos_token_id, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.max_position_embeddings = 100 - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BlenderbotModel(config=config).get_decoder().set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(mindspore.bool_)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def 
check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = BlenderbotModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = BlenderbotEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = BlenderbotDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BlenderbotModel, BlenderbotForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (BlenderbotForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": BlenderbotForConditionalGeneration, - "feature-extraction": BlenderbotModel, - "summarization": BlenderbotForConditionalGeneration, - "text-generation": BlenderbotForCausalLM, - "text2text-generation": BlenderbotForConditionalGeneration, - "translation": BlenderbotForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = True - test_pruning = False - test_missing_keys = False - - def setUp(self): - self.model_tester = BlenderbotModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = BlenderbotForConditionalGeneration(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - -def assert_tensors_close(a, b, 
atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class Blenderbot3BIntegrationTests(unittest.TestCase): - ckpt = "facebook/blenderbot-3B" - - @cached_property - def tokenizer(self): - return BlenderbotTokenizer.from_pretrained(self.ckpt) - - @slow - def test_generation_from_short_input_same_as_parlai_3B(self): - FASTER_GEN_KWARGS = {"num_beams": 1, "early_stopping": True, "min_length": 15, "max_length": 25} - TOK_DECODE_KW = {"skip_special_tokens": True, "clean_up_tokenization_spaces": True} - - model = BlenderbotForConditionalGeneration.from_pretrained(self.ckpt) - src_text = ["Sam"] - model_inputs = self.tokenizer(src_text, return_tensors="ms") - - generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS) - tgt_text = 'Sam is a great name. It means "sun" in Gaelic.' - - generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW) - assert generated_txt[0].strip() == tgt_text - - src_text = ( - "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel" - " like i'm going to throw up.\nand why is that?" - ) - - model_inputs = self.tokenizer([src_text], return_tensors="ms") - - generated_ids = model.generate(**model_inputs, **FASTER_GEN_KWARGS)[0] - reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW) - - assert "I think it's because we are so worried about what people think of us." 
== reply.strip() - del model - - -class BlenderbotStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - encoder_no_repeat_ngram_size=0, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - self.encoder_no_repeat_ngram_size = encoder_no_repeat_ngram_size - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = BlenderbotConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - encoder_no_repeat_ngram_size=self.encoder_no_repeat_ngram_size, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = BlenderbotDecoder(config=config).set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - 
next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = BlenderbotDecoder(config=config).set_train(False) - - # create attention mask - attn_mask = ops.ones(*input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - # past_key_values = model(input_ids, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones(attn_mask.shape[0], 1, dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class BlenderbotStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BlenderbotDecoder, BlenderbotForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (BlenderbotForCausalLM,) if is_mindspore_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = BlenderbotStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return diff --git a/tests/transformers/models/blenderbot_small/__init__.py b/tests/transformers/models/blenderbot_small/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/transformers/models/blenderbot_small/test_modeling_blenderbot_small.py deleted file mode 100644 index e21e8280b..000000000 --- a/tests/transformers/models/blenderbot_small/test_modeling_blenderbot_small.py +++ /dev/null @@ -1,571 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch BlenderbotSmall model. """ - -import tempfile -import unittest - -import numpy as np -from mindnlp.transformers import BlenderbotSmallConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, - is_mindspore_available -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel, BlenderbotSmallTokenizer - from mindnlp.transformers.models.blenderbot_small.modeling_blenderbot_small import ( - BlenderbotSmallDecoder, - BlenderbotSmallEncoder, - BlenderbotSmallForCausalLM, - ) - - -def prepare_blenderbot_small_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class BlenderbotSmallModelTester: - def 
__init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=50, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - # forcing a certain token to be generated, sets all other tokens to -inf - # if however the token to be generated is already at -inf then it can lead token - # `nan` values and thus break generation - self.forced_bos_token_id = None - self.forced_eos_token_id = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return BlenderbotSmallConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - forced_bos_token_id=self.forced_bos_token_id, - forced_eos_token_id=self.forced_eos_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = BlenderbotSmallModel(config=config).get_decoder().set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(mindspore.bool_)], dim=-1) - - output_from_no_past = model(next_input_ids, 
attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = BlenderbotSmallModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = BlenderbotSmallEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = BlenderbotSmallDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (BlenderbotSmallForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": BlenderbotSmallForConditionalGeneration, - "feature-extraction": BlenderbotSmallModel, - "summarization": BlenderbotSmallForConditionalGeneration, - "text-generation": BlenderbotSmallForCausalLM, - "text2text-generation": BlenderbotSmallForConditionalGeneration, - "translation": BlenderbotSmallForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = True - test_pruning = False - test_missing_keys = False - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return pipeline_test_casse_name in ("TextGenerationPipelineTests", "ConversationalPipelineTests") - - def setUp(self): - self.model_tester = BlenderbotSmallModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as 
tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = BlenderbotSmallForConditionalGeneration(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -@require_mindspore -class Blenderbot90MIntegrationTests(unittest.TestCase): - ckpt = "facebook/blenderbot-90M" - - @cached_property - def model(self): - model = BlenderbotSmallForConditionalGeneration.from_pretrained(self.ckpt) - if mindspore.get_context('device_target') != "CPU": - model = model.half() - return model - - @cached_property - def tokenizer(self): - return BlenderbotSmallTokenizer.from_pretrained(self.ckpt) - - @slow - def test_90_generation_from_long_input(self): - src_text = [ - "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel" - " like i'm going to throw up.\nand why is that?" - ] - - model_inputs = self.tokenizer(src_text, return_tensors="ms") - - assert isinstance(self.tokenizer, BlenderbotSmallTokenizer) - generated_ids = self.model.generate(**model_inputs)[0] - reply = self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) - - assert reply in ( - "i don't know. i just feel like i'm going to throw up. it's not fun.", - "i'm not sure. i just feel like i've been feeling like i have to be in a certain place", - ) - - @slow - def test_90_generation_from_short_input(self): - model_inputs = self.tokenizer(["sam"], return_tensors="ms") - - generated_utterances = self.model.generate(**model_inputs) - - clean_txt = self.tokenizer.decode( - generated_utterances[0], skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - assert clean_txt in ( - "have you ever been to a sam club? it's a great club in the south.", - "have you ever heard of sam harris? 
he's an american singer, songwriter, and actor.", - ) - - -class BlenderbotSmallStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = BlenderbotSmallConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = BlenderbotSmallDecoder(config=config).set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] 
- output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = BlenderbotSmallDecoder(config=config).set_train(False) - - # create attention mask - attn_mask = ops.ones(*input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones(attn_mask.shape[0], 1, dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class BlenderbotSmallStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BlenderbotSmallDecoder, BlenderbotSmallForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (BlenderbotSmallForCausalLM,) if is_mindspore_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = BlenderbotSmallStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return \ No newline at end of file diff --git a/tests/transformers/models/blip/__init__.py b/tests/transformers/models/blip/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/blip/test_modeling_blip.py b/tests/transformers/models/blip/test_modeling_blip.py deleted file mode 100644 index 08dc72e62..000000000 --- a/tests/transformers/models/blip/test_modeling_blip.py +++ /dev/null @@ -1,1077 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Blip model. """ - - -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import BlipConfig, BlipTextConfig, BlipVisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - BlipForConditionalGeneration, - BlipForImageTextRetrieval, - BlipForQuestionAnswering, - BlipModel, - BlipTextModel, - BlipVisionModel, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import BlipProcessor - - -class BlipVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - 
config = self.get_config() - - return config, pixel_values - - def get_config(self): - return BlipVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = BlipVisionModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class BlipVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Blip does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (BlipVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = BlipVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlipVisionConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Blip does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients 
properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="BlipVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="BlipVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip-vqa-base" - model = BlipVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class BlipTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((int(start_index),), batch_idx, dtype=mindspore.int64), ops.arange(int(start_index))], dim=1), - ops.full((int(start_index),), 1, dtype=mindspore.int64)) - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((input_mask.shape[1] - int(start_index),), batch_idx, dtype=mindspore.int64), ops.arange(int(input_mask.shape[1] - start_index))], dim=1), - ops.full((input_mask.shape[1] - int(start_index),), 0, dtype=mindspore.int64)) - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return BlipTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = BlipTextModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, 
self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipTextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = BlipTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Blip does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="BlipTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="BlipTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip-vqa-base" - model = BlipTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class BlipModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) - self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return BlipConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = BlipModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, 
self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class BlipModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BlipModel, - "image-to-text": BlipForConditionalGeneration, - "visual-question-answering": BlipForQuestionAnswering, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = BlipModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="BlipModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for Blip - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save BlipConfig and check if we can load BlipVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = BlipVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save BlipConfig and check if we can load BlipTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = BlipTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip-vqa-base" - model = BlipModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class BlipTextRetrievalModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - 
text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) - self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return BlipConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = BlipModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - } - return config, inputs_dict - - -class BlipTextImageModelsModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) - self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for pt-tf equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return BlipConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = BlipModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "labels": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - } - return config, inputs_dict - - -class 
BlipVQAModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = BlipTextModelTester(parent, **text_kwargs) - self.vision_model_tester = BlipVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return BlipConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = BlipModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "labels": input_ids, - "decoder_input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - } - return config, inputs_dict - - -@require_mindspore -@require_vision -class BlipVQAModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipForQuestionAnswering,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = BlipVQAModelTester(self) - - def _prepare_inputs_for_vqa(self): - _, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - inputs_dict["labels"] = inputs_dict["input_ids"] - inputs_dict["decoder_input_ids"] = inputs_dict["input_ids"] - inputs_dict.pop("return_loss") - return inputs_dict - - def test_class_name_consistency(self): - """ - Tests that all VQA models have a class name that ends with "ForQuestionAnswering" - """ - for model_class in self.all_model_classes: - model = model_class(self.model_tester.get_config()) - self.assertTrue( - model.__class__.__name__.endswith("ForQuestionAnswering"), - f"Class name should end with 'ForVisualQuestionAnswering' got {model.__class__.__name__}", - ) - - def test_training(self): - """ - Tests that all VQA models can be trained on a single batch - """ - for model_class in self.all_model_classes: - model = model_class(self.model_tester.get_config()) - model.set_train() - loss = model(**self.model_tester.prepare_config_and_inputs_for_common()[1]).loss - - def test_forward_signature(self): - """ - Test if the forward function has the expected arguments. 
- """ - for model_class in self.all_model_classes: - model = model_class(self.model_tester.get_config()) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so args are the first n entries - args = list(signature.parameters.keys()) - expected_args = [ - "input_ids", - "attention_mask", - "labels", - "decoder_input_ids", - "decoder_attention_mask", - ] - for arg in expected_args: - self.assertTrue( - arg in args, - f"Argument {arg} of forward function signature should include {arg}. Found {args}.", - ) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="BlipModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - -@require_mindspore -class BlipTextRetrievalModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipForImageTextRetrieval,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = BlipTextRetrievalModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="BlipModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = [ - "input_ids", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["input_ids"] if model_class != BlipForConditionalGeneration else ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_training(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes[:-1]: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - # hardcode labels to be the same as input_ids - inputs["labels"] = inputs["input_ids"] - - loss = model(**inputs).loss - - @unittest.skip( - reason="This architecure seem to 
not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # override as the `logit_scale` parameter initilization is different for Blip - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save BlipConfig and check if we can load BlipVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = BlipVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save BlipConfig and check if we can load BlipTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = BlipTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip-vqa-base" - model = BlipModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipForConditionalGeneration,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = BlipTextImageModelsModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="BlipModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an 
OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = [ - "input_ids", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["input_ids"] if model_class != BlipForConditionalGeneration else ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_training(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes[:-1]: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - # hardcode labels to be the same as input_ids - inputs["labels"] = inputs["input_ids"] - - loss = model(**inputs).loss - - # override as the `logit_scale` parameter initilization is different for Blip - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save BlipConfig and check if we can load BlipVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = BlipVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save BlipConfig and check if we can load BlipTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = BlipTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip-vqa-base" - model = BlipModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://hf-mirror.com/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -@slow -class BlipModelIntegrationTest(unittest.TestCase): - def test_inference_image_captioning(self): - model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") - processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") - image = 
prepare_img() - - # image only - inputs = processor(images=image, return_tensors="ms") - - predictions = model.generate(**inputs) - - # Test output - self.assertEqual(predictions[0].tolist(), [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]) - - # image and context - context = ["a picture of"] - inputs = processor(images=image, text=context, return_tensors="ms") - - predictions = model.generate(**inputs) - - # Test output - self.assertEqual( - predictions[0].tolist(), - [30522, 1037, 3861, 1997, 1037, 2450, 1998, 2014, 3899, 2006, 1996, 3509, 102], - ) - - @require_mindspore - def test_inference_image_captioning_fp16(self): - model = BlipForConditionalGeneration.from_pretrained( - "Salesforce/blip-image-captioning-base", ms_dtype=mindspore.float16 - ) - processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") - image = prepare_img() - - # image only - inputs = processor(images=image, return_tensors="ms").to(mindspore.float16) - - predictions = model.generate(**inputs) - - # Test output - self.assertEqual(predictions[0].tolist(), [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]) - - # image and context - context = ["a picture of"] - inputs = processor(images=image, text=context, return_tensors="ms").to(mindspore.float16) - - predictions = model.generate(**inputs) - - # Test output - self.assertEqual( - predictions[0].tolist(), - [30522, 1037, 3861, 1997, 1037, 2450, 1998, 2014, 3899, 2006, 1996, 3509, 102], - ) - - def test_inference_vqa(self): - model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") - processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") - - image = prepare_img() - text = "how many dogs are in the picture?" - - inputs = processor(image, text=text, return_tensors="ms") - out = model.generate(**inputs) - - # Test output - self.assertEqual(out[0].tolist(), [30522, 1015, 102]) - - def test_inference_itm(self): - model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") - processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco") - - image = prepare_img() - text = "A woman and her dog sitting in a beach" - - inputs = processor(image, text, return_tensors="ms") - - out_itm = model(**inputs) - out = model(**inputs, use_itm_head=False) - - expected_scores = mindspore.Tensor([[0.0029, 0.9971]]) - - self.assertTrue(np.allclose(ops.softmax(out_itm[0]).asnumpy(), expected_scores.asnumpy(), rtol=1e-3, atol=1e-3)) - self.assertTrue(np.allclose(out[0].asnumpy(), mindspore.Tensor([[0.5162]]).asnumpy(), rtol=1e-3, atol=1e-3)) diff --git a/tests/transformers/models/blip/test_modeling_blip_text.py b/tests/transformers/models/blip/test_modeling_blip_text.py deleted file mode 100644 index fd9525d59..000000000 --- a/tests/transformers/models/blip/test_modeling_blip_text.py +++ /dev/null @@ -1,178 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
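# The hard-coded token-id expectations in the BLIP integration tests above are easier to
# audit when decoded back to text. A minimal sketch, assuming the same
# "Salesforce/blip-image-captioning-base" checkpoint and that mindnlp's BlipProcessor
# mirrors the Hugging Face tokenizer decoding API.
from mindnlp.transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
ids = [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]
caption = processor.batch_decode([ids], skip_special_tokens=True)[0]
print(caption)  # roughly "a woman sitting on the beach with her dog"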
-""" Testing suite for the MindSpore Blip model. """ -import unittest - -import numpy as np - -from mindnlp.transformers import BlipTextConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import BlipTextModel - - -class BlipTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((int(start_index),), batch_idx, dtype=mindspore.int64), ops.arange(int(start_index))], dim=1), - ops.full((int(start_index),), 1, dtype=mindspore.int64)) - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((input_mask.shape[1] - int(start_index),), batch_idx, dtype=mindspore.int64), ops.arange(int(input_mask.shape[1] - start_index))], dim=1), - ops.full((input_mask.shape[1] - int(start_index),), 0, dtype=mindspore.int64)) - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return BlipTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = BlipTextModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def 
prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipTextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = BlipTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlipTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Blip does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="BlipTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="BlipTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip-vqa-base" - model = BlipTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) diff --git a/tests/transformers/models/blip_2/__init__.py b/tests/transformers/models/blip_2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/blip_2/test_modeling_blip_2.py b/tests/transformers/models/blip_2/test_modeling_blip_2.py deleted file mode 100644 index cf7afc259..000000000 --- a/tests/transformers/models/blip_2/test_modeling_blip_2.py +++ /dev/null @@ -1,1009 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch BLIP-2 model. 
""" - - -import inspect -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - - from mindnlp.transformers import Blip2ForConditionalGeneration, Blip2Model, Blip2VisionModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import Blip2Processor - -class Blip2VisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return Blip2VisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = Blip2VisionModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - 
-@require_mindspore -class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as BLIP-2's vision encoder does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (Blip2VisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = Blip2VisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=Blip2VisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="BLIP-2's vision encoder does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Blip2VisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Blip2VisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip2-opt-2.7b" - model = Blip2VisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class Blip2QFormerModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - 
self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :start_index] = 1 - input_mask[batch_idx, start_index:] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return Blip2QFormerConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - ) - - -# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py -class Blip2TextModelDecoderOnlyTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - embed_dim=16, - num_labels=3, - word_embed_proj_dim=16, - type_sequence_label_size=2, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.embed_dim = embed_dim - self.num_labels = num_labels - self.type_sequence_label_size = type_sequence_label_size - self.word_embed_proj_dim = word_embed_proj_dim - self.is_encoder_decoder = False - - def prepare_config_and_inputs(self): - config = self.get_config() - - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3) - input_ids[:, -1] = self.eos_token_id # Eos Token - - attention_mask = input_ids.ne(self.pad_token_id) - - return config, input_ids, attention_mask - - def get_config(self): - return CONFIG_MAPPING["opt"]( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - 
dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - embed_dim=self.embed_dim, - is_encoder_decoder=False, - word_embed_proj_dim=self.word_embed_proj_dim, - ) - - -# this model tester uses a decoder-only language model (OPT) -class Blip2ForConditionalGenerationDecoderOnlyModelTester: - def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 - ): - if vision_kwargs is None: - vision_kwargs = {} - if qformer_kwargs is None: - qformer_kwargs = {} - if text_kwargs is None: - text_kwargs = {} - - self.parent = parent - self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) - self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) - self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests - self.is_training = is_training - self.num_query_tokens = num_query_tokens - - def prepare_config_and_inputs(self): - _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( - vision_config=self.vision_model_tester.get_config(), - qformer_config=self.qformer_model_tester.get_config(), - text_config=self.text_model_tester.get_config(), - num_query_tokens=self.num_query_tokens, - ) - - def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values): - model = Blip2ForConditionalGeneration(config).set_train(False) - result = model(pixel_values, input_ids, attention_mask) - - expected_seq_length = self.num_query_tokens + self.text_model_tester.seq_length - self.parent.assertEqual( - result.logits.shape, - (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "labels": input_ids, - } - return config, inputs_dict - - -@require_mindspore -class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (Blip2ForConditionalGeneration,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self) - - def test_for_conditional_generation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in 
individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Blip2Model does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="There's no base Blip2Model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="There's no base Blip2Model") - def test_save_load_fast_init_to_base(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save Blip2Config and check if we can load Blip2VisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save Blip2Config and check if we can load Blip2QFormerConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip2-opt-2.7b" - model = Blip2ForConditionalGeneration.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py -class Blip2TextModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=12, - encoder_seq_length=7, - decoder_seq_length=9, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - decoder_start_token_id=0, - scope=None, - decoder_layers=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.scope = None - self.decoder_layers = decoder_layers - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - decoder_input_ids = 
ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = self.get_config() - - return ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def get_config(self): - return CONFIG_MAPPING["t5"]( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - -# this model tester uses an encoder-decoder language model (T5) -class Blip2ModelTester: - def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 - ): - if vision_kwargs is None: - vision_kwargs = {} - if qformer_kwargs is None: - qformer_kwargs = {} - if text_kwargs is None: - text_kwargs = {} - - self.parent = parent - self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) - self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) - self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests - self.is_training = is_training - self.num_query_tokens = num_query_tokens - - def prepare_config_and_inputs(self): - _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - ( - _, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = self.text_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values, decoder_input_ids, decoder_attention_mask, lm_labels - - def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( - vision_config=self.vision_model_tester.get_config(), - qformer_config=self.qformer_model_tester.get_config(), - text_config=self.text_model_tester.get_config(), - num_query_tokens=self.num_query_tokens, - ) - - def create_and_check_for_conditional_generation( - self, config, input_ids, attention_mask, pixel_values, decoder_input_ids, decoder_attention_mask, labels - ): - model = Blip2ForConditionalGeneration(config).set_train(False) - result = model(pixel_values, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask) - - self.parent.assertEqual( - result.logits.shape, - ( - self.vision_model_tester.batch_size, - self.text_model_tester.seq_length, - self.text_model_tester.vocab_size, - ), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - pixel_values, - decoder_input_ids, - decoder_attention_mask, - 
labels, - ) = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "labels": labels, - } - return config, inputs_dict - - -@require_mindspore -class Blip2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": Blip2Model, - "image-to-text": Blip2ForConditionalGeneration, - "visual-question-answering": Blip2ForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = Blip2ModelTester(self) - - def test_for_conditional_generation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Blip2Model does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="There's no base Blip2Model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="There's no base Blip2Model") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_cpu_offload(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save Blip2Config and check if we can load Blip2VisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save Blip2Config and check if we can load Blip2QFormerConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/blip2-opt-2.7b" - model = Blip2ForConditionalGeneration.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_get_text_features(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - 
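# The hand-built batch below intentionally bypasses the model tester's prepared inputs:
# a single sequence of ten token ids (plus matching decoder ids) is enough to exercise
# get_text_features on the T5-based text model and check that its first output has shape
# (batch_size, sequence_length, vocab_size) = (1, 10, config.text_config.vocab_size),
# as asserted a few lines further down.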
inputs_dict = { - "input_ids": mindspore.Tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]), - "attention_mask": mindspore.Tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), - "decoder_input_ids": mindspore.Tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]), - } - - model = Blip2Model(config) - model.set_train(False) - text_features = model.get_text_features(**inputs_dict) - self.assertEqual(text_features[0].shape, (1, 10, config.text_config.vocab_size)) - - def test_get_image_features(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"] - - for key in keys_to_pop: - inputs_dict.pop(key) - - model = Blip2Model(config) - model.set_train(False) - image_features = model.get_image_features(**inputs_dict) - self.assertEqual( - image_features[0].shape, - ( - self.model_tester.vision_model_tester.batch_size, - self.model_tester.vision_model_tester.seq_length, - config.vision_config.hidden_size, - ), - ) - - def test_get_qformer_features(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"] - - for key in keys_to_pop: - inputs_dict.pop(key) - - model = Blip2Model(config) - model.set_train(False) - qformer_features = model.get_qformer_features(**inputs_dict) - self.assertEqual( - qformer_features[0].shape, - (self.model_tester.vision_model_tester.batch_size, 10, config.vision_config.hidden_size), - ) - - # override from common to deal with nested configurations (`vision_config`, `text_config` and `qformer_config`) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for key in ["vision_config", "qformer_config", "text_config"]: - setattr(configs_no_init, key, _config_zero_init(getattr(configs_no_init, key))) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://hf-mirror.com/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -@require_vision -@require_mindspore -@slow -class Blip2ModelIntegrationTest(unittest.TestCase): - def test_inference_opt(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-opt-2.7b", ms_dtype=mindspore.float16 - ) - - # prepare image - image = prepare_img() - inputs = processor(images=image, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - # Test output - self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]) - self.assertEqual("a woman sitting on the beach with a dog", generated_text) - - # image and context - prompt = "Question: which city is this? 
Answer:" - inputs = processor(images=image, text=prompt, return_tensors="ms").to(dtype=mindspore.float16) - # max_length for BLIP includes prompt length from now on, use max_new_tokens - predictions = model.generate(**inputs, max_new_tokens=11) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual( - predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], - ) - self.assertEqual(generated_text, "it's not a city, it's a beach") - - def test_inference_opt_batched_beam_search(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-opt-2.7b", ms_dtype=mindspore.float16 - ) - - # prepare image - image = prepare_img() - inputs = processor(images=[image, image], return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs, num_beams=2) - - # Test output (in this case, slightly different from greedy search) - self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 69, 2335, 50118]) - self.assertEqual(predictions[1].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 69, 2335, 50118]) - - def test_inference_t5(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-flan-t5-xl", ms_dtype=mindspore.float16 - ) - - # prepare image - image = prepare_img() - inputs = processor(images=image, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual(predictions[0].tolist(), [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1]) - self.assertEqual("woman playing with dog on the beach", generated_text) - - # image and context - prompt = "Question: which city is this? 
Answer:" - inputs = processor(images=image, text=prompt, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual( - predictions[0].tolist(), - [0, 3, 7, 152, 67, 839, 1], - ) - self.assertEqual(generated_text, "san diego") - - def test_inference_t5_batched_beam_search(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-flan-t5-xl", ms_dtype=mindspore.float16 - ) - - # prepare image - image = prepare_img() - inputs = processor(images=[image, image], return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs, num_beams=2) - - # Test output (in this case, slightly different from greedy search) - self.assertEqual(predictions[0].tolist(), [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1]) - self.assertEqual(predictions[1].tolist(), [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1]) - - @require_mindspore - def test_inference_opt_multi_accelerator(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-opt-2.7b", ms_dtype=mindspore.float16 - ) - - # prepare image - image = prepare_img() - inputs = processor(images=image, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]) - self.assertEqual("a woman sitting on the beach with a dog", generated_text) - - # image and context - prompt = "Question: which city is this? Answer:" - inputs = processor(images=image, text=prompt, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual( - predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], - ) - self.assertEqual(generated_text, "it's not a city, it's a beach") - - @require_mindspore - def test_inference_t5_multi_accelerator(self): - processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") - device_map = device_map = { - "query_tokens": 0, - "vision_model": 0, - "language_model": 1, - "language_projection": 0, - "qformer": 0, - } - - model = Blip2ForConditionalGeneration.from_pretrained( - "Salesforce/blip2-flan-t5-xl", ms_dtype=mindspore.float16 - ) - - # prepare image - image = prepare_img() - inputs = processor(images=image, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual(predictions[0].tolist(), [0, 2335, 1556, 28, 1782, 30, 8, 2608, 1]) - self.assertEqual("woman playing with dog on the beach", generated_text) - - # image and context - prompt = "Question: which city is this? 
Answer:" - inputs = processor(images=image, text=prompt, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Test output - self.assertEqual( - predictions[0].tolist(), - [0, 3, 7, 152, 67, 839, 1], - ) - self.assertEqual(generated_text, "san diego") \ No newline at end of file diff --git a/tests/transformers/models/bloom/__init__.py b/tests/transformers/models/bloom/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bloom/test_modeling_bloom.py b/tests/transformers/models/bloom/test_modeling_bloom.py deleted file mode 100644 index f2fa9be2c..000000000 --- a/tests/transformers/models/bloom/test_modeling_bloom.py +++ /dev/null @@ -1,819 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import math -import unittest - -from mindnlp.transformers import BloomConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - BloomForCausalLM, - BloomForQuestionAnswering, - BloomForSequenceClassification, - BloomForTokenClassification, - BloomModel, - BloomTokenizerFast, - ) - -@require_mindspore -class BloomModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=False, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_dropout_prob = attention_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - 
self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return BloomConfig.from_pretrained("bigscience/bloom") - - def prepare_config_and_inputs(self, gradient_checkpointing=False): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config(gradient_checkpointing=gradient_checkpointing) - - return (config, input_ids, input_mask, sequence_labels) - - def get_config(self, gradient_checkpointing=False, slow_but_exact=True): - return BloomConfig( - vocab_size=self.vocab_size, - seq_length=self.seq_length, - hidden_size=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - hidden_dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - num_labels=self.num_labels, - gradient_checkpointing=gradient_checkpointing, - slow_but_exact=slow_but_exact, - dtype="float32", - ) - - def create_and_check_bloom_model(self, config, input_ids, input_mask, *args): - model = BloomModel(config=config) - model.eval() - - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_bloom_model_past(self, config, input_ids, input_mask, *args): - model = BloomModel(config=config) - - model.eval() - - # first forward pass - outputs = model(input_ids, attention_mask=ops.ones_like(input_ids), use_cache=True) - outputs_use_cache_conf = model(input_ids, attention_mask=ops.ones_like(input_ids)) - outputs_no_past = model(input_ids, use_cache=False, attention_mask=ops.ones_like(input_ids)) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_bloom_model_attention_mask_past(self, config, input_ids, input_mask, *args): - model = BloomModel(config=config) - model.eval() - - # create attention mask - attn_mask = 
ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_bloom_model_past_large_inputs(self, config, input_ids, input_mask, *args): - model = BloomModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past)[ - "last_hidden_state" - ] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - # test that outputs are equal for slice - print(output_from_past_slice, output_from_no_past_slice) - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): - model = BloomForCausalLM(config) - model.eval() - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_sequence_classification_model(self, config, input_ids, input_mask, *args): - config.num_labels = self.num_labels - model = BloomForSequenceClassification(config) - model.eval() - - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 
self.num_labels)) - - def create_and_check_token_classification_model(self, config, input_ids, input_mask, *args): - model = BloomForTokenClassification(config) - model.eval() - - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_question_answering_model(self, config, input_ids, input_mask, *args): - model = BloomForQuestionAnswering(config) - model.eval() - - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, *args, gradient_checkpointing=False - ): - model = BloomForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - def forward(input_ids): - result = model(input_ids, labels=input_ids) - loss = result.loss - logits = result.logits - return loss, logits - - grad_fn = mindspore.value_and_grad(forward, None, tuple(model.parameters())) - (loss, logits), grad = grad_fn(input_ids) - self.parent.assertEqual(loss.shape, ()) - self.parent.assertEqual(logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_bloom_weight_initialization(self, config, *args): - model = BloomModel(config) - model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) - for key in model.state_dict().keys(): - if "c_proj" in key and "weight" in key: - self.parent.assertLessEqual(abs(ops.std(model.state_dict()[key]) - model_std), 0.001) - self.parent.assertLessEqual(abs(ops.mean(model.state_dict()[key]) - 0.0), 0.01) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - config, input_ids, input_mask, sequence_labels = config_and_inputs - - inputs_dict = {"input_ids": input_ids} - - return config, inputs_dict - - -@require_mindspore -class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - BloomModel, - BloomForCausalLM, - BloomForSequenceClassification, - BloomForTokenClassification, - BloomForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - - all_generative_model_classes = (BloomForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": BloomModel, - "question-answering": BloomForQuestionAnswering, - "text-classification": BloomForSequenceClassification, - "text-generation": BloomForCausalLM, - "token-classification": BloomForTokenClassification, - "zero-shot": BloomForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - test_missing_keys = False - test_pruning = False - - def setUp(self): - self.model_tester = BloomModelTester(self) - self.config_tester = ConfigTester(self, config_class=BloomConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_bloom_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_bloom_model(*config_and_inputs) - - def test_bloom_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_bloom_model_past(*config_and_inputs) - - def test_bloom_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_bloom_model_attention_mask_past(*config_and_inputs) - - @unittest.skip - def test_bloom_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_bloom_model_past_large_inputs(*config_and_inputs) - - def test_bloom_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_bloom_sequence_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_sequence_classification_model(*config_and_inputs) - - def test_bloom_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_token_classification_model(*config_and_inputs) - - @unittest.skip - def test_bloom_gradient_checkpointing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - def test_bloom_weight_initialization(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_bloom_weight_initialization(*config_and_inputs) - - @unittest.skip(reason="Bloom has a non-standard KV cache format.") - def test_past_key_values_format(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "bigscience/bigscience-small-testing" - model = BloomModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - @require_mindspore - def test_simple_generation(self): - # This test is a bit flaky. For some GPU architectures, pytorch sets by default allow_fp16_reduced_precision_reduction = True and some operations - # do not give the same results under this configuration, especially torch.baddmm and torch.bmm. https://pytorch.org/docs/stable/notes/numerical_accuracy.html#fp16-on-mi200 - # As we leave the default value (True) for allow_fp16_reduced_precision_reduction , the tests failed when running in half-precision with smaller models (560m) - # Please see: https://pytorch.org/docs/stable/notes/cuda.html#reduced-precision-reduction-in-fp16-gemms - # This discrepancy is observed only when using small models and seems to be stable for larger models. - # Our conclusion is that these operations are flaky for small inputs but seems to be stable for larger inputs (for the functions `baddmm` and `bmm`), and therefore for larger models. - - # Here is a summary of an ablation study of our observations - # EXPECTED_OUTPUT = "I enjoy walking with my cute dog, and I love to watch the kids play. I am a very active person, and I am a very good listener. I am a very good person, and I am a very good person. I am a" - # 560m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS - # 560m + allow_fp16_reduced_precision_reduction = False + torch.baddm ==> PASS - # 560m + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS - # 560m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> FAIL - - # EXPECTED_OUTPUT = "I enjoy walking with my cute dog, but I also enjoy hiking, biking, and swimming. I love to cook and bake. I love to cook and bake. I love to cook and bake. I love to cook and bake. 
I love" - # >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.baddm ==> PASS (for use_cache=True and use_cache=False) - # >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> PASS - # >=1b1 + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS - - path_560m = "bigscience/bloom-560m" - model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750") - model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_560m) - - input_sentence = "I enjoy walking with my cute dog" - # This output has been obtained using fp32 model on the huggingface DGX workstation - NVIDIA A100 GPU - EXPECTED_OUTPUT = ( - "I enjoy walking with my cute dog, and I love to watch the kids play with the kids. I am a very " - "active person, and I enjoy working out, and I am a very active person. I am a very active person, and I" - ) - - input_ids = tokenizer.encode(input_sentence, return_tensors="ms") - greedy_output = model.generate(input_ids, max_length=50) - print(greedy_output[0]) - self.assertEqual(tokenizer.decode(greedy_output[0], skip_special_tokens=True), EXPECTED_OUTPUT) - - @slow - @require_mindspore - def test_batch_generation(self): - path_560m = "bigscience/bloom-560m" - model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750") - model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left") - - input_sentence = ["I enjoy walking with my cute dog", "I enjoy walking with my cute dog"] - - inputs = tokenizer.batch_encode_plus(input_sentence, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - attention_mask = inputs["attention_mask"] - greedy_output = model.generate(input_ids, attention_mask=attention_mask, max_length=50, do_sample=False) - - self.assertEqual( - tokenizer.decode(greedy_output[0], skip_special_tokens=True), - tokenizer.decode(greedy_output[1], skip_special_tokens=True), - ) - - @slow - @require_mindspore - def test_batch_generation_padd(self): - path_560m = "bigscience/bloom-560m" - model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750") - model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left") - - input_sentence = ["I enjoy walking with my cute dog", "Hello my name is"] - input_sentence_without_pad = "Hello my name is" - - input_ids = tokenizer.batch_encode_plus(input_sentence, return_tensors="ms", padding=True) - input_ids_without_pad = tokenizer.encode(input_sentence_without_pad, return_tensors="ms") - - input_ids, attention_mask = input_ids["input_ids"], input_ids["attention_mask"] - greedy_output = model.generate(input_ids, attention_mask=attention_mask, max_length=50, do_sample=False) - greedy_output_without_pad = model.generate( - input_ids_without_pad, max_length=50, do_sample=False - ) - - # test token values - self.assertEqual(greedy_output[-1, 3:].tolist(), greedy_output_without_pad[0, :-3].tolist()) - - # test reconstructions - self.assertEqual( - tokenizer.decode(greedy_output[-1, 3:], skip_special_tokens=True), - tokenizer.decode(greedy_output_without_pad[0, :-3], skip_special_tokens=True), - ) - - @slow - @require_mindspore - def test_batch_generated_text(self): - path_560m = "bigscience/bloom-560m" - - model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750") - model = model.eval() - tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left") - - input_sentences = [ - "Hello what 
is", - "Running a quick test with the", - ] - inputs = tokenizer(input_sentences, return_tensors="ms", padding=True, truncation=True) - generated_ids = model.generate( - inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=20 - ) - generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - # these generations match those of the PyTorch model - EXPECTED_GENERATIONS = [ - "Hello what is the best way to get the data from the server? I have tried", - "Running a quick test with the following command:\nsudo apt-get install python3\nsudo apt-get install python2", - ] - - self.assertListEqual(generated_text, EXPECTED_GENERATIONS) - - -@require_mindspore -class BloomEmbeddingTest(unittest.TestCase): - """ - The goal here is to compare the embeddings generated by the model trained - using Megatron-LM with the one from the transformers library, with a small GPT2-like model - to ensure that the conversion from Megatron-LM to transformers has been done successfully. - The script compares the logits of the embedding layer and the transformer layers. - - WARNING: It is expected that these logits will not have exactly the same statistics when running - the code on CPU or GPU. For more info, please visit: - - https://github.com/pytorch/pytorch/issues/76052#issuecomment-1103193548 - - https://discuss.pytorch.org/t/reproducibility-issue-between-intel-and-amd-cpus/144779/9 - - - You need to install tokenizers following this readme: - - https://huggingface.co/bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles - - Tokenizer used during training: - - https://huggingface.co/bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles - - # TODO change the script (or just add skip) when building the env with tokenizers 0.12.0 - """ - - def setUp(self): - super().setUp() - self.path_bigscience_model = "bigscience/bigscience-small-testing" - - @require_mindspore - def test_embeddings(self): - # The config in this checkpoint has `bfloat16` as `ms_dtype` -> model in `bfloat16` - model = BloomForCausalLM.from_pretrained(self.path_bigscience_model, ms_dtype=mindspore.float16) - model.eval() - - EMBEDDINGS_DS_BEFORE_LN_BF_16_MEAN = { - 3478: 0.0002307891845703125, - 368: -0.000568389892578125, - 109586: -0.0003910064697265625, - 35433: -0.000194549560546875, - 2: 0.0004138946533203125, - 77: 0.000659942626953125, - 132619: -0.00031280517578125, - 2175: 0.000457763671875, - 23714: 0.000263214111328125, - 73173: -0.000286102294921875, - 144252: 0.00052642822265625, - } - EMBEDDINGS_DS_BEFORE_LN_BF_16_MIN = { - 3478: -0.00921630859375, - 368: -0.010009765625, - 109586: -0.01031494140625, - 35433: -0.01177978515625, - 2: -0.0074462890625, - 77: -0.00848388671875, - 132619: -0.009521484375, - 2175: -0.0074462890625, - 23714: -0.0145263671875, - 73173: -0.007415771484375, - 144252: -0.01007080078125, - } - EMBEDDINGS_DS_BEFORE_LN_BF_16_MAX = { - 3478: 0.0128173828125, - 368: 0.01214599609375, - 109586: 0.0111083984375, - 35433: 0.01019287109375, - 2: 0.0157470703125, - 77: 0.0174560546875, - 132619: 0.0078125, - 2175: 0.0113525390625, - 23714: 0.0146484375, - 73173: 0.01116943359375, - 144252: 0.01141357421875, - } - EMBEDDINGS_DS_BEFORE_LN_BF_16_SUM = {"value": 0.08203125} - - EMBEDDINGS_DS_BEFORE_LN_F_16_MEAN = { - 132619: -0.00031256675720214844, - 3478: 0.00023090839385986328, - 368: -0.0005702972412109375, - 109586: -0.00039124488830566406, - 
35433: -0.000194549560546875, - 2: 0.0004146099090576172, - 2175: 0.0004572868347167969, - 23714: 0.00026416778564453125, - 73173: -0.0002865791320800781, - 144252: 0.0005254745483398438, - 77: 0.0006618499755859375, - } - EMBEDDINGS_DS_BEFORE_LN_F_16_MIN = { - 3478: -0.00921630859375, - 368: -0.010009765625, - 109586: -0.01031494140625, - 35433: -0.01177978515625, - 2: -0.0074462890625, - 77: -0.00848388671875, - 132619: -0.009521484375, - 2175: -0.0074462890625, - 23714: -0.0145263671875, - 73173: -0.007415771484375, - 144252: -0.01007080078125, - } - EMBEDDINGS_DS_BEFORE_LN_F_16_MAX = { - 3478: 0.0128173828125, - 368: 0.01214599609375, - 109586: 0.0111083984375, - 35433: 0.01019287109375, - 2: 0.0157470703125, - 77: 0.0174560546875, - 132619: 0.0078125, - 2175: 0.0113525390625, - 23714: 0.0146484375, - 73173: 0.01116943359375, - 144252: 0.01141357421875, - } - EMBEDDINGS_DS_BEFORE_LN_F_16_SUM = {"value": 0.0821533203125} - - EMBEDDINGS_DS_BEFORE_LN_F_32_MEAN = { - 132619: -0.00031267106533050537, - 3478: 0.00023087859153747559, - 368: -0.0005701072514057159, - 109586: -0.0003911703824996948, - 35433: -0.0001944899559020996, - 2: 0.0004146844148635864, - 2175: 0.00045740045607089996, - 23714: 0.0002641640603542328, - 73173: -0.0002864748239517212, - 144252: 0.0005256589502096176, - 77: 0.0006617321632802486, - } - EMBEDDINGS_DS_BEFORE_LN_F_32_MIN = { - 3478: -0.00921630859375, - 368: -0.010009765625, - 109586: -0.01031494140625, - 35433: -0.01177978515625, - 2: -0.0074462890625, - 77: -0.00848388671875, - 132619: -0.009521484375, - 2175: -0.0074462890625, - 23714: -0.0145263671875, - 73173: -0.007415771484375, - 144252: -0.01007080078125, - } - EMBEDDINGS_DS_BEFORE_LN_F_32_MAX = { - 3478: 0.0128173828125, - 368: 0.01214599609375, - 109586: 0.0111083984375, - 35433: 0.01019287109375, - 2: 0.0157470703125, - 77: 0.0174560546875, - 132619: 0.0078125, - 2175: 0.0113525390625, - 23714: 0.0146484375, - 73173: 0.01116943359375, - 144252: 0.01141357421875, - } - EMBEDDINGS_DS_BEFORE_LN_F_32_SUM = {"value": 0.08217757940292358} - - TEST_EMBEDDINGS = { - "bfloat16": { - "mean": EMBEDDINGS_DS_BEFORE_LN_BF_16_MEAN, - "max": EMBEDDINGS_DS_BEFORE_LN_BF_16_MAX, - "min": EMBEDDINGS_DS_BEFORE_LN_BF_16_MIN, - "sum": EMBEDDINGS_DS_BEFORE_LN_BF_16_SUM, - }, - "float32": { - "mean": EMBEDDINGS_DS_BEFORE_LN_F_32_MEAN, - "max": EMBEDDINGS_DS_BEFORE_LN_F_32_MAX, - "min": EMBEDDINGS_DS_BEFORE_LN_F_32_MIN, - "sum": EMBEDDINGS_DS_BEFORE_LN_F_32_SUM, - }, - "float": { - "mean": EMBEDDINGS_DS_BEFORE_LN_F_32_MEAN, - "max": EMBEDDINGS_DS_BEFORE_LN_F_32_MAX, - "min": EMBEDDINGS_DS_BEFORE_LN_F_32_MIN, - "sum": EMBEDDINGS_DS_BEFORE_LN_F_32_SUM, - }, - "float16": { - "mean": EMBEDDINGS_DS_BEFORE_LN_F_16_MEAN, - "max": EMBEDDINGS_DS_BEFORE_LN_F_16_MAX, - "min": EMBEDDINGS_DS_BEFORE_LN_F_16_MIN, - "sum": EMBEDDINGS_DS_BEFORE_LN_F_16_SUM, - }, - } - - EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip - - EMBEDDINGS_DS_AFTER_LN_MEAN = { - 3478: -6.580352783203125e-05, - 368: 0.0001316070556640625, - 109586: -0.00030517578125, - 35433: 4.00543212890625e-05, - 2: -7.2479248046875e-05, - 77: -8.96453857421875e-05, - 132619: 0.0001583099365234375, - 2175: 2.1219253540039062e-05, - 23714: -0.000247955322265625, - 73173: -0.00021839141845703125, - 144252: -0.0001430511474609375, - } - EMBEDDINGS_DS_AFTER_LN_MIN = { - 3478: -1.6953125, - 368: -1.6875, - 109586: -1.6875, - 35433: -2.125, - 2: -1.390625, - 77: -1.5390625, - 132619: -1.875, 
- 2175: -1.4609375, - 23714: -2.296875, - 73173: -1.3515625, - 144252: -1.78125, - } - EMBEDDINGS_DS_AFTER_LN_MAX = { - 3478: 2.265625, - 368: 2.28125, - 109586: 1.953125, - 35433: 1.90625, - 2: 2.703125, - 77: 2.828125, - 132619: 1.65625, - 2175: 2.015625, - 23714: 2.234375, - 73173: 2.171875, - 144252: 1.828125, - } - - EMBEDDINGS_DS_AFTER_LN = { - "mean": EMBEDDINGS_DS_AFTER_LN_MEAN, - "min": EMBEDDINGS_DS_AFTER_LN_MIN, - "max": EMBEDDINGS_DS_AFTER_LN_MAX, - } - - tensor_ids = mindspore.Tensor([EXAMPLE_IDS]) - with no_grad(): - embeddings = model.transformer.word_embeddings(tensor_ids) - embeddings_ln = model.transformer.word_embeddings_layernorm(embeddings) # - # first check the embeddings before LN - output_dict = {"min": {}, "max": {}, "mean": {}, "sum": {"value": embeddings.sum().item()}} - for i, idx in enumerate(EXAMPLE_IDS): - output_dict["min"][idx] = ops.min(embeddings, dim=-1)[0][0][i].item() - output_dict["max"][idx] = ops.max(embeddings, dim=-1)[0][0][i].item() - output_dict["mean"][idx] = ops.mean(embeddings, dim=-1)[0][i].item() - - for key in TEST_EMBEDDINGS[str(model.dtype).lower()].keys(): - self.assertDictEqual(TEST_EMBEDDINGS[str(model.dtype).lower()][key], output_dict[key]) - - output_dict_norm = {"min": {}, "max": {}, "mean": {}} - for i, idx in enumerate(EXAMPLE_IDS): - output_dict_norm["min"][idx] = ops.min(embeddings_ln, dim=-1)[0][0][i].item() - output_dict_norm["max"][idx] = ops.max(embeddings_ln, dim=-1)[0][0][i].item() - output_dict_norm["mean"][idx] = ops.mean(embeddings_ln, dim=-1)[0][i].item() - - # This test does not pass when places = 2 - for i, key in enumerate(output_dict_norm.keys()): - for j, idx in enumerate(output_dict[key].keys()): - self.assertAlmostEqual(EMBEDDINGS_DS_AFTER_LN[key][idx], output_dict_norm[key][idx], places=1) - - @unittest.skip - @require_mindspore - def test_hidden_states_transformers(self): - model = BloomModel.from_pretrained(self.path_bigscience_model, use_cache=False, ms_dtype=mindspore.float16) - model.eval() - - EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip - - MEAN_VALUE_LAST_LM = -4.3392181396484375e-05 - MIN_MAX_DICT = {"min": -2.0625, "max": 2.75} - tensor_ids = mindspore.Tensor([EXAMPLE_IDS]) - - with no_grad(): - logits = model(tensor_ids) - output_dict = { - "min": ops.min(logits.last_hidden_state, dim=-1)[0][0][0].item(), - "max": ops.max(logits.last_hidden_state, dim=-1)[0][0][0].item(), - } - - if mindspore.get_context('device_target') == 'GPU': - self.assertAlmostEqual(MEAN_VALUE_LAST_LM, logits.last_hidden_state.mean().item(), places=4) - else: - self.assertAlmostEqual(MEAN_VALUE_LAST_LM, logits.last_hidden_state.mean().item(), places=3) - - self.assertDictEqual(MIN_MAX_DICT, output_dict) - - @require_mindspore - def test_logits(self): - model = BloomForCausalLM.from_pretrained(self.path_bigscience_model, use_cache=False, ms_dtype=mindspore.float16) - model.eval() - - EXAMPLE_IDS = [3478, 368, 109586, 35433, 2, 77, 132619, 3478, 368, 109586, 35433, 2, 2175, 23714, 73173, 144252, 2, 77, 132619, 3478] # fmt: skip - - MEAN_LOGITS_GPU_1 = -1.823902130126953e-05 - MEAN_LOGITS_GPU_2 = 1.9431114196777344e-05 - - tensor_ids = mindspore.Tensor([EXAMPLE_IDS]) - with no_grad(): - output = model(tensor_ids).logits - - output_gpu_1, output_gpu_2 = ops.split(output, 125440, dim=-1) - if mindspore.get_context('device_target') == 'GPU': - self.assertAlmostEqual(output_gpu_1.mean().item(), MEAN_LOGITS_GPU_1, places=6) - 
self.assertAlmostEqual(output_gpu_2.mean().item(), MEAN_LOGITS_GPU_2, places=6) - else: - self.assertAlmostEqual(output_gpu_1.mean().item(), MEAN_LOGITS_GPU_1, places=6) # 1e-06 precision!! - self.assertAlmostEqual(output_gpu_2.mean().item(), MEAN_LOGITS_GPU_2, places=6) diff --git a/tests/transformers/models/bridgetower/__init__.py b/tests/transformers/models/bridgetower/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bridgetower/test_modeling_bridgetower.py b/tests/transformers/models/bridgetower/test_modeling_bridgetower.py deleted file mode 100644 index 8c29bce71..000000000 --- a/tests/transformers/models/bridgetower/test_modeling_bridgetower.py +++ /dev/null @@ -1,592 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch BridgeTower model. """ - -import tempfile -import unittest - -import numpy as np - -from mindnlp.transformers import ( - BridgeTowerConfig, - BridgeTowerTextConfig, - BridgeTowerVisionConfig, -) -from mindnlp.utils.import_utils import is_vision_available, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - BridgeTowerForContrastiveLearning, - BridgeTowerForImageAndTextRetrieval, - BridgeTowerForMaskedLM, - BridgeTowerModel, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import BridgeTowerProcessor - - -class BridgeTowerTextModelTester: - def __init__( - self, - parent, - hidden_act="gelu", - hidden_size=64, - initializer_factor=1, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=2, - intermediate_size=128, - tie_word_embeddings=False, - output_hidden_states=False, - ): - self.parent = parent - self.hidden_act = hidden_act - self.hidden_size = hidden_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.tie_word_embeddings = tie_word_embeddings - self.vocab_size = 99 - self.seq_length = 4 - self.batch_size = 1 - self.is_training = False - self.output_hidden_states = output_hidden_states - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, 
input_ids, attention_mask - - def get_config(self): - return BridgeTowerTextConfig( - hidden_act=self.hidden_act, - hidden_size=self.hidden_size, - initializer_factor=self.initializer_factor, - layer_norm_eps=self.layer_norm_eps, - num_attention_heads=self.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=self.intermediate_size, - tie_word_embeddings=self.tie_word_embeddings, - output_hidden_states=self.output_hidden_states, - vocab_size=self.vocab_size, - ) - - -class BridgeTowerImageModelTester: - def __init__( - self, - parent, - hidden_size=64, - initializer_factor=1, - layer_norm_eps=1e-05, - num_hidden_layers=2, - init_layernorm_from_vision_encoder=False, - output_hidden_states=False, - image_size=64, - ): - self.parent = parent - self.hidden_size = hidden_size - self.initializer_factor = initializer_factor - self.layer_norm_eps = layer_norm_eps - self.num_hidden_layers = num_hidden_layers - self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder - self.num_channels = 3 - self.num_image_features = 17 - self.batch_size = 1 - self.image_size = image_size - self.is_training = False - self.output_hidden_states = output_hidden_states - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - pixel_mask = random_attention_mask([self.batch_size, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values, pixel_mask - - def get_config(self): - return BridgeTowerVisionConfig( - hidden_size=self.hidden_size, - initializer_factor=self.initializer_factor, - layer_norm_eps=self.layer_norm_eps, - num_hidden_layers=self.num_hidden_layers, - init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder, - num_channels=self.num_channels, - num_image_features=self.num_image_features, - batch_size=self.batch_size, - image_size=self.image_size, - is_training=self.is_training, - output_hidden_states=self.output_hidden_states, - ) - - -class BridgeTowerModelTester: - def __init__( - self, - parent, - text_kwargs=None, - vision_kwargs=None, - share_cross_modal_transformer_layers=True, - share_link_tower_layers=False, - link_tower_type="add", - init_layernorm_from_vision_encoder=False, - contrastive_hidden_size=512, - logit_scale_init_value=2.6592, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=128, - ): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = BridgeTowerTextModelTester(parent, **text_kwargs) - self.vision_model_tester = BridgeTowerImageModelTester(parent, **vision_kwargs) - - self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers - self.share_link_tower_layers = share_link_tower_layers - self.link_tower_type = link_tower_type - self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder - self.contrastive_hidden_size = contrastive_hidden_size - self.logit_scale_init_value = logit_scale_init_value - - self.batch_size = 1 - self.expected_num_hidden_layers = 8 - self.is_training = False - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values, pixel_mask = 
self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return (config, input_ids, attention_mask, pixel_values, pixel_mask) - - def get_config(self): - return BridgeTowerConfig.from_text_vision_configs( - text_config=self.text_model_tester.get_config(), - vision_config=self.vision_model_tester.get_config(), - share_cross_modal_transformer_layers=self.share_cross_modal_transformer_layers, - share_link_tower_layers=self.share_link_tower_layers, - link_tower_type=self.link_tower_type, - init_layernorm_from_vision_encoder=self.init_layernorm_from_vision_encoder, - contrastive_hidden_size=self.contrastive_hidden_size, - logit_scale_init_value=self.logit_scale_init_value, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - ) - - def create_and_check_model( - self, - config, - input_ids, - attention_mask, - pixel_values, - pixel_mask, - ): - model = BridgeTowerModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) - self.parent.assertEqual( - result["text_features"].shape, - (self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.hidden_size), - ) - self.parent.assertEqual( - result["image_features"].shape, - (self.batch_size, self.vision_model_tester.num_image_features, self.vision_model_tester.hidden_size), - ) - self.parent.assertEqual( - result["pooler_output"].shape, - (self.batch_size, self.text_model_tester.hidden_size + self.vision_model_tester.hidden_size), - ) - - def create_and_check_for_image_and_text_retrieval( - self, - config, - input_ids, - attention_mask, - pixel_values, - pixel_mask, - ): - bridgetower_itm_output_last_dimension = 2 - - model = BridgeTowerForImageAndTextRetrieval(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, bridgetower_itm_output_last_dimension)) - - def create_and_check_for_masked_language_modeling( - self, - config, - input_ids, - attention_mask, - pixel_values, - pixel_mask, - ): - model = BridgeTowerForMaskedLM(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values) - - self.parent.assertEqual( - result.logits.shape, - (self.batch_size, self.text_model_tester.seq_length, self.text_model_tester.vocab_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, attention_mask, pixel_values, pixel_mask) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "pixel_mask": pixel_mask, - } - return config, inputs_dict - - -@require_mindspore -class BridgeTowerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - BridgeTowerModel, - BridgeTowerForImageAndTextRetrieval, - BridgeTowerForMaskedLM, - BridgeTowerForContrastiveLearning, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = 
{"feature-extraction": BridgeTowerModel} if is_mindspore_available() else {} - - is_training = False - test_headmasking = False - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - has_attentions = False - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_disk_offload(self): - pass - - @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") - def test_model_parallelism(self): - pass - - # function to extract meaningful tensor from output per different model_class - def extract_output(self, outputs, model_class): - return outputs["pooler_output"] if model_class == "BridgeTowerModel" else outputs["logits"] - - def setUp(self): - self.model_tester = BridgeTowerModelTester(self) - self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=37, vocab_size=99) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_image_and_text_retrieval(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_and_text_retrieval(*config_and_inputs) - - def test_for_masked_language_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_language_modeling(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "BridgeTower/bridgetower-base" - model = BridgeTowerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_save_load_fast_init_from_base(self): - # Override as it is a slow test on this model - super().test_save_load_fast_init_from_base() - - # Override as extracting meaningful tensor from output is different for BridgeTower - def test_save_load(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**input_dict) - - out_2 = self.extract_output(outputs, model_class.__name__) - out_2 = out_2.asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - after_outputs = model(**input_dict) - - # Make sure we don't have nans - out_1 = self.extract_output(after_outputs, model_class.__name__) - out_1 = out_1.asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - # Override this as `hidden states output` is different for BridgeTower - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states_text, hidden_states_vision, hidden_states_cross = ( - outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - ) - - expected_num_layers = self.model_tester.expected_num_hidden_layers - self.assertEqual( - sum((len(hidden_states_text), len(hidden_states_vision), len(hidden_states_cross))), - expected_num_layers, - ) - - seq_length = 
self.model_tester.text_model_tester.seq_length - num_image_features = self.model_tester.vision_model_tester.num_image_features - - self.assertListEqual( - list(hidden_states_text[0].shape[-2:]), - [seq_length, self.model_tester.text_model_tester.hidden_size], - ) - self.assertListEqual( - list(hidden_states_vision[0].shape), - [num_image_features, 1, self.model_tester.vision_model_tester.hidden_size], - ) - self.assertListEqual( - list(hidden_states_cross[0][0].shape[-2:]), - [seq_length, self.model_tester.text_model_tester.hidden_size], - ) - self.assertListEqual( - list(hidden_states_cross[0][1].shape[-2:]), - [num_image_features, self.model_tester.vision_model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - check_hidden_states_output(inputs_dict, config, model_class) - - # override as the `logit_scale` parameter initilization is different for BRIDGE TOWER - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - config.logit_scale_init_value, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. So this test is not applicable.""") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="""Bridge Tower does not have input/output embeddings. Thus this test is not applicable.""") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Bridge Tower does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class BridgeTowerModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processor(self): - return ( - BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") - if is_vision_available() - else None - ) - - @slow - def test_image_and_text_retrieval(self): - model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") - model.set_train(False) - processor = self.default_processor - image = prepare_img() - text = "a bunch of cats laying on a tower." 
- inputs = processor(image, text, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 2) - self.assertEqual(outputs.logits.shape, expected_shape) - self.assertTrue(outputs.logits[0, 1].item() > outputs.logits[0, 0].item()) - - # verify loss - inputs["labels"] = ops.ones(1, dtype=mindspore.int64) - inputs = inputs - outputs = model(**inputs) - self.assertAlmostEqual(outputs.loss.item(), 0.5108, places=4) - - @slow - def test_masked_language_modeling(self): - model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") - model.set_train(False) - processor = self.default_processor - image = prepare_img() - text = "a bunch of <mask> laying on a tower." - inputs = processor(image, text, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 11, 50265) - self.assertEqual(outputs.logits.shape, expected_shape) - - # verify predicted word - predicted_id = outputs.logits.argmax(axis=-1).squeeze(0).tolist()[4] - self.assertTrue(processor.decode([predicted_id]) == " cats") - - # verify loss - inputs["labels"] = inputs["input_ids"].copy() - inputs = inputs - outputs = model(**inputs) - self.assertAlmostEqual(outputs.loss.item(), 5.7373, places=4) - - @slow - def test_constrastive_learning(self): - model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") - model.set_train(False) - processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") - image = prepare_img() - text = "a bunch of cats laying on a tower." - inputs = processor(image, text, padding=True, return_tensors="ms") - outputs = model(**inputs, output_hidden_states=True, return_loss=True) - - # verify the logits - expected_shape = (1, 3, 512) - self.assertEqual(outputs.logits.shape, expected_shape) - - -@slow -@require_mindspore -class BridgeTowerModelTrainingTest(unittest.TestCase): - all_training_supported_model_classes = ( - (BridgeTowerForImageAndTextRetrieval, BridgeTowerForMaskedLM, BridgeTowerForContrastiveLearning) - if is_mindspore_available() - else () - ) - - def setUp(self): - self.model_tester = BridgeTowerModelTester(self) - self.config_tester = ConfigTester(self, config_class=BridgeTowerConfig, hidden_size=37, vocab_size=99) - - def _prepare_inputs_for_training(self, model_class): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if model_class == BridgeTowerForMaskedLM: - inputs_dict["labels"] = inputs_dict["input_ids"] - elif model_class == BridgeTowerForImageAndTextRetrieval: - inputs_dict["labels"] = ids_tensor([1], 2) - elif model_class == BridgeTowerForContrastiveLearning: - inputs_dict["return_loss"] = True - return config, inputs_dict - - def _get_non_used_layer_names(self, model_class): - non_used_layer_names = ["text_model.pooler"] - if model_class == BridgeTowerForMaskedLM: - non_used_layer_names = non_used_layer_names + [ - # This number `1` actually depends on the number of layers in `cross_modal_image_layers` (by minus 1) - "cross_modal_image_layers.1", - "cross_modal_image_pooler", - "cross_modal_text_pooler", - ] - return non_used_layer_names - - def _is_layer_used(self, model_class, layer_name): - non_used_layer_names = self._get_non_used_layer_names(model_class) - for non_used_layer_name in non_used_layer_names: - if non_used_layer_name in layer_name: - return False - return True diff --git a/tests/transformers/models/bros/__init__.py
b/tests/transformers/models/bros/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/bros/test_modeling_bros.py b/tests/transformers/models/bros/test_modeling_bros.py deleted file mode 100644 index 63f9d427b..000000000 --- a/tests/transformers/models/bros/test_modeling_bros.py +++ /dev/null @@ -1,432 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Bros model. """ - -import copy -import unittest - -import numpy as np -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - BrosConfig, - BrosForTokenClassification, - BrosModel, - BrosSpadeEEForTokenClassification, - BrosSpadeELForTokenClassification, - ) - -class BrosModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_bbox_first_token_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_bbox_first_token_mask = use_bbox_first_token_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - bbox = ids_tensor([self.batch_size, self.seq_length, 8], 1) - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = bbox[i, j, 2] - bbox[i, j, 2] 
= bbox[i, j, 0] - bbox[i, j, 0] = t - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - bbox_first_token_mask = None - if self.use_bbox_first_token_mask: - bbox_first_token_mask = ops.ones((self.batch_size, self.seq_length), dtype=mindspore.bool_) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - token_labels = None - if self.use_labels: - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - initial_token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - subsequent_token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return ( - config, - input_ids, - bbox, - token_type_ids, - input_mask, - bbox_first_token_mask, - token_labels, - initial_token_labels, - subsequent_token_labels, - ) - - def get_config(self): - return BrosConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def create_and_check_model( - self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - bbox_first_token_mask, - token_labels, - initial_token_labels, - subsequent_token_labels, - ): - model = BrosModel(config=config) - model.set_train(False) - result = model(input_ids, bbox=bbox, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, bbox=bbox, token_type_ids=token_type_ids) - result = model(input_ids, bbox=bbox) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - bbox_first_token_mask, - token_labels, - initial_token_labels, - subsequent_token_labels, - ): - config.num_labels = self.num_labels - model = BrosForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, bbox=bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_spade_ee_token_classification( - self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - bbox_first_token_mask, - token_labels, - initial_token_labels, - subsequent_token_labels, - ): - config.num_labels = self.num_labels - model = BrosSpadeEEForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - bbox=bbox, - attention_mask=input_mask, - bbox_first_token_mask=bbox_first_token_mask, - token_type_ids=token_type_ids, - initial_token_labels=token_labels, - subsequent_token_labels=token_labels, - ) - self.parent.assertEqual(result.initial_token_logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - self.parent.assertEqual( - result.subsequent_token_logits.shape, (self.batch_size, self.seq_length, self.seq_length + 1) - ) - - def create_and_check_for_spade_el_token_classification( - 
self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - bbox_first_token_mask, - token_labels, - initial_token_labels, - subsequent_token_labels, - ): - config.num_labels = self.num_labels - model = BrosSpadeELForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - bbox=bbox, - attention_mask=input_mask, - bbox_first_token_mask=bbox_first_token_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.seq_length + 1)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - token_type_ids, - input_mask, - bbox_first_token_mask, - token_labels, - initial_token_labels, - subsequent_token_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "bbox": bbox, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class BrosModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False - test_torchscript = False - test_mismatched_shapes = False - - all_model_classes = ( - ( - BrosForTokenClassification, - BrosSpadeEEForTokenClassification, - BrosSpadeELForTokenClassification, - BrosModel, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = () if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": BrosModel, "token-classification": BrosForTokenClassification} - if is_mindspore_available() - else {} - ) - - # BROS requires `bbox` in the inputs which doesn't fit into the above 2 pipelines' input formats. - # see https://github.com/huggingface/transformers/pull/26294 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = BrosModelTester(self) - self.config_tester = ConfigTester(self, config_class=BrosConfig, hidden_size=37) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if return_labels: - if model_class.__name__ in ["BrosForTokenClassification", "BrosSpadeELForTokenClassification"]: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["bbox_first_token_mask"] = ops.ones( - [self.model_tester.batch_size, self.model_tester.seq_length], - dtype=mindspore.bool_, - ) - elif model_class.__name__ in ["BrosSpadeEEForTokenClassification"]: - inputs_dict["initial_token_labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["subsequent_token_labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["bbox_first_token_mask"] = ops.ones( - [self.model_tester.batch_size, self.model_tester.seq_length], - dtype=mindspore.bool_, - ) - - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - 
config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_spade_ee_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_spade_ee_token_classification(*config_and_inputs) - - def test_for_spade_el_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_spade_el_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "jinho8345/bros-base-uncased" - model = BrosModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -def prepare_bros_batch_inputs(): - attention_mask = mindspore.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - - bbox = mindspore.tensor( - [ - [ - [0.0000, 0.0000, 0.0000, 0.0000], - [0.5223, 0.5590, 0.5787, 0.5720], - [0.5853, 0.5590, 0.6864, 0.5720], - [0.5853, 0.5590, 0.6864, 0.5720], - [0.1234, 0.5700, 0.2192, 0.5840], - [0.2231, 0.5680, 0.2782, 0.5780], - [0.2874, 0.5670, 0.3333, 0.5780], - [0.3425, 0.5640, 0.4344, 0.5750], - [0.0866, 0.7770, 0.1181, 0.7870], - [0.1168, 0.7770, 0.1522, 0.7850], - [0.1535, 0.7750, 0.1864, 0.7850], - [0.1890, 0.7750, 0.2572, 0.7850], - [1.0000, 1.0000, 1.0000, 1.0000], - ], - [ - [0.0000, 0.0000, 0.0000, 0.0000], - [0.4396, 0.6720, 0.4659, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6850], - [0.1575, 0.6870, 0.2021, 0.6980], - [0.2047, 0.6870, 0.2730, 0.7000], - [0.1299, 0.7010, 0.1430, 0.7140], - [0.1299, 0.7010, 0.1430, 0.7140], - [0.1562, 0.7010, 0.2441, 0.7120], - [0.1562, 0.7010, 0.2441, 0.7120], - [0.2454, 0.7010, 0.3150, 0.7120], - [0.3176, 0.7010, 0.3320, 0.7110], - [0.3333, 0.7000, 0.4029, 0.7140], - [1.0000, 1.0000, 1.0000, 1.0000], - ], - ] - ) - input_ids = mindspore.tensor( - [ - [101, 1055, 8910, 1012, 5719, 3296, 5366, 3378, 2146, 2846, 10807, 13494, 102], - [101, 2112, 1997, 3671, 6364, 1019, 1012, 5057, 1011, 4646, 2030, 2974, 102], - ] - ) - - return input_ids, bbox, attention_mask - - -@require_mindspore -class BrosModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = BrosModel.from_pretrained("jinho8345/bros-base-uncased") - - input_ids, bbox, attention_mask = prepare_bros_batch_inputs() - - outputs = model( - input_ids, - bbox, - attention_mask=attention_mask, - return_dict=True, - ) - - # verify the logits - expected_shape = (2, 13, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.3074, 0.1363, 0.3143], [0.0925, -0.1155, 0.1050], [0.0221, 0.0003, 0.1285]] - ) - - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/camembert/__init__.py b/tests/transformers/models/camembert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/camembert/test_modeling_camembert.py b/tests/transformers/models/camembert/test_modeling_camembert.py deleted file mode 100644 index bc0a87e91..000000000 --- a/tests/transformers/models/camembert/test_modeling_camembert.py +++ /dev/null @@ -1,55 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import mindspore -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore, slow - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import CamembertModel - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class CamembertModelIntegrationTest(unittest.TestCase): - @slow - def test_output_embeds_base_model(self): - model = CamembertModel.from_pretrained("almanach/camembert-base") - - input_ids = mindspore.tensor( - [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], - dtype=mindspore.int64, - ) # J'aime le camembert ! - with mindspore._no_grad(): - output = model(input_ids)["last_hidden_state"] - expected_shape = (1, 10, 768) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [[[-0.0254, 0.0235, 0.1027], [0.0606, -0.1811, -0.0418], [-0.1561, -0.1127, 0.2687]]], - dtype=mindspore.float32, - ) - # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0') - # camembert.eval() - # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach() - - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/canine/__init__.py b/tests/transformers/models/canine/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/canine/test_modeling_canine.py b/tests/transformers/models/canine/test_modeling_canine.py deleted file mode 100644 index 056d19827..000000000 --- a/tests/transformers/models/canine/test_modeling_canine.py +++ /dev/null @@ -1,574 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore CANINE model.""" - -import unittest -from typing import List, Tuple -import numpy as np - -from mindnlp.transformers import CanineConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, global_rng, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - CanineForMultipleChoice, - CanineForQuestionAnswering, - CanineForSequenceClassification, - CanineForTokenClassification, - CanineModel, - ) - - -class CanineModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - # let's use a vocab size that's way bigger than BERT's one - # NOTE: this is not a model parameter, just an input - vocab_size=100000, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - num_hash_buckets=16, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.num_hash_buckets = num_hash_buckets - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor(input_ids.shape, self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return CanineConfig( - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - 
type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - num_hash_buckets=self.num_hash_buckets, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = CanineModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = CanineForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = CanineForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = CanineForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = CanineForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class CanineModelTest(ModelTesterMixin, unittest.TestCase): # PipelineTesterMixin - all_model_classes = ( - ( - CanineModel, - CanineForMultipleChoice, - CanineForQuestionAnswering, - CanineForSequenceClassification, - 
CanineForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": CanineModel, - "question-answering": CanineForQuestionAnswering, - "text-classification": CanineForSequenceClassification, - "token-classification": CanineForTokenClassification, - "zero-shot": CanineForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - test_mismatched_shapes = False - test_resize_embeddings = False - test_pruning = False - - def setUp(self): - self.model_tester = CanineModelTester(self) - # we set has_text_modality to False as the config has no vocab_size attribute - self.config_tester = ConfigTester(self, config_class=CanineConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - # expected_num_layers equals num_hidden_layers of the deep encoder + 1, + 2 for the first shallow encoder, + 2 - # for the final shallow encoder - expected_num_layers = self.model_tester.num_hidden_layers + 1 + 2 + 2 - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.seq_length - for i in range(expected_num_layers): - if (i < 2) or ((expected_num_layers - i) < 3): - # the expected length of the hidden_states of the first and final shallow encoders - # is equal to the seq_length - self.assertListEqual( - list(hidden_states[i].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - else: - # the expected length of the hidden_states of the deep encoder need to be updated - # for CANINE since the seq length is downsampled - self.assertListEqual( - list(hidden_states[i].shape[-2:]), - [seq_length // config.downsampling_rate, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = 
getattr(self.model_tester, "seq_length", None) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - # we add + 2 due to the 2 shallow encoders - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - # we add + 2 due to the 2 shallow encoders - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers + 2) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers + 2) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - ops.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." 
- ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) - - @unittest.skip( - reason="mindspore do not use loss.backward" - ) - def test_headmasking(self): - if not self.test_head_masking: - self.skipTest(reason="test_head_masking is set to False") - - global_rng.seed(42) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - global_rng.seed() - - inputs_dict["output_attentions"] = True - config.output_hidden_states = True - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.eval() - - # Prepare head_mask - # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = ops.ones( - self.model_tester.num_hidden_layers, - self.model_tester.num_attention_heads, - ) - head_mask[0, 0] = 0 - head_mask[-1, :-1] = 0 - inputs = self._prepare_for_class(inputs_dict, model_class).copy() - inputs["head_mask"] = head_mask - - def forward(head_mask): - outputs = model(**inputs, return_dict=True) - - # Test that we can get a gradient back for importance score computation - output = sum(t.sum() for t in outputs[0]) - output = output.sum() - return output, outputs - grad_fn = mindspore.value_and_grad(forward, 0) - (_, outputs), grads = grad_fn(head_mask) - - # output.backward() - multihead_outputs = grads[0] - - self.assertIsNotNone(multihead_outputs) - self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) - - def check_attentions_validity(attentions): - # Remove Nan - for t in attentions: - self.assertLess( - ops.sum(ops.isnan(t)), t.numel() / 4 - ) # Check we don't have more than 25% nans (arbitrary) - attentions = [ - 
t.masked_fill(ops.isnan(t), 0.0) for t in attentions - ] # remove them (the test is less complete) - - self.assertAlmostEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[1][..., -1, :, :].flatten().sum().item(), 0.0) - self.assertAlmostEqual(attentions[-2][..., -2, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual(attentions[-2][..., -1, :, :].flatten().sum().item(), 0.0) - - check_attentions_validity(outputs.attentions) - - @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") - def test_inputs_embeds(self): - # ViT does not use inputs_embeds - pass - - @unittest.skip(reason="Canine Tower does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip( - reason="can not run on CPU" - ) - def test_training(self): - pass - - - @slow - def test_model_from_pretrained(self): - model_name = "google/canine-s" - model = CanineModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class CanineModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = CanineModel.from_pretrained("google/canine-s") - # this one corresponds to the first example of the TydiQA dev set (in Swahili) - # fmt: off - input_ids = [57344, 57349, 85, 107, 117, 98, 119, 97, 32, 119, 97, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 111, 114, 105, 32, 110, 105, 32, 107, 105, 97, 115, 105, 32, 103, 97, 110, 105, 63, 57345, 57350, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 111, 114, 105, 32, 44, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 97, 117, 32, 105, 110, 103, 46, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 40, 112, 105, 97, 58, 32, 84, 111, 108, 105, 109, 97, 110, 32, 97, 117, 32, 82, 105, 103, 105, 108, 32, 75, 101, 110, 116, 97, 117, 114, 117, 115, 41, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 105, 110, 97, 121, 111, 110, 103, 39, 97, 97, 32, 115, 97, 110, 97, 32, 107, 97, 116, 105, 107, 97, 32, 97, 110, 103, 97, 32, 121, 97, 32, 107, 117, 115, 105, 110, 105, 32, 107, 119, 101, 110, 121, 101, 32, 107, 117, 110, 100, 105, 110, 121, 111, 116, 97, 32, 121, 97, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 40, 112, 105, 97, 58, 32, 105, 110, 103, 46, 32, 67, 101, 110, 116, 97, 117, 114, 117, 115, 41, 46, 32, 78, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 107, 117, 110, 103, 97, 97, 32, 115, 97, 110, 97, 32, 121, 97, 32, 110, 110, 101, 32, 97, 110, 103, 97, 110, 105, 32, 108, 97, 
107, 105, 110, 105, 32, 104, 97, 105, 111, 110, 101, 107, 97, 110, 105, 32, 107, 119, 101, 110, 121, 101, 32, 110, 117, 115, 117, 100, 117, 110, 105, 97, 32, 121, 97, 32, 107, 97, 115, 107, 97, 122, 105, 110, 105, 46, 32, 57351, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 112, 101, 107, 101, 101, 32, 107, 119, 97, 32, 115, 97, 98, 97, 98, 117, 32, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 101, 116, 117, 32, 106, 105, 114, 97, 110, 105, 32, 107, 97, 116, 105, 107, 97, 32, 97, 110, 103, 97, 32, 105, 110, 97, 32, 117, 109, 98, 97, 108, 105, 32, 119, 97, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 50, 46, 32, 73, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 97, 110, 103, 97, 110, 105, 32, 107, 97, 114, 105, 98, 117, 32, 110, 97, 32, 107, 117, 110, 100, 105, 110, 121, 111, 116, 97, 32, 121, 97, 32, 83, 97, 108, 105, 98, 117, 32, 40, 67, 114, 117, 120, 41, 46, 32, 57352, 32, 82, 105, 106, 105, 108, 105, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 40, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 41, 32, 105, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 107, 97, 109, 97, 32, 110, 121, 111, 116, 97, 32, 109, 111, 106, 97, 32, 108, 97, 107, 105, 110, 105, 32, 107, 119, 97, 32, 100, 97, 114, 117, 98, 105, 110, 105, 32, 107, 117, 98, 119, 97, 32, 105, 110, 97, 111, 110, 101, 107, 97, 110, 97, 32, 107, 117, 119, 97, 32, 109, 102, 117, 109, 111, 32, 119, 97, 32, 110, 121, 111, 116, 97, 32, 116, 97, 116, 117, 32, 122, 105, 110, 97, 122, 111, 107, 97, 97, 32, 107, 97, 114, 105, 98, 117, 32, 110, 97, 32, 107, 117, 115, 104, 105, 107, 97, 109, 97, 110, 97, 32, 107, 97, 116, 105, 32, 121, 97, 111, 46, 32, 78, 121, 111, 116, 97, 32, 109, 97, 112, 97, 99, 104, 97, 32, 122, 97, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 65, 32, 110, 97, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 66, 32, 122, 105, 107, 111, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 51, 54, 32, 107, 117, 116, 111, 107, 97, 32, 107, 119, 101, 116, 117, 32, 110, 97, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 116, 97, 116, 117, 32, 65, 108, 112, 104, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 67, 32, 97, 117, 32, 80, 114, 111, 120, 105, 109, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 105, 110, 97, 32, 117, 109, 98, 97, 108, 105, 32, 119, 97, 32, 109, 105, 97, 107, 97, 32, 121, 97, 32, 110, 117, 114, 117, 32, 52, 46, 50, 50, 46, 32, 57353, 32, 80, 114, 111, 120, 105, 109, 97, 32, 67, 101, 110, 116, 97, 117, 114, 105, 32, 40, 121, 97, 97, 110, 105, 32, 110, 121, 111, 116, 97, 32, 121, 97, 32, 75, 97, 110, 116, 97, 114, 117, 115, 105, 32, 105, 108, 105, 121, 111, 32, 107, 97, 114, 105, 98, 117, 32, 122, 97, 105, 100, 105, 32, 110, 97, 115, 105, 41, 32, 105, 109, 101, 103, 117, 110, 100, 117, 108, 105, 119, 97, 32, 107, 117, 119, 97, 32, 110, 97, 32, 115, 97, 121, 97, 114, 105, 32, 109, 111, 106, 97, 46, 32, 86, 105, 112, 105, 109, 111, 32, 118, 105, 110, 97, 118, 121, 111, 112, 97, 116, 105, 107, 97, 110, 97, 32, 104, 97, 100, 105, 32, 115, 97, 115, 97, 32, 122, 105, 110, 97, 111, 110, 121, 101, 115, 104, 97, 32, 117, 119, 101, 122, 101, 107, 97, 110, 111, 32, 109, 107, 117, 98, 119, 97, 32, 121, 97, 32, 107, 119, 97, 109, 98, 97, 32, 115, 97, 121, 97, 114, 105, 32, 104, 105, 105, 32, 110, 105, 32, 121, 97, 32, 109, 119, 97, 109, 98, 97, 32, 40, 107, 97, 109, 97, 32, 100, 117, 110, 105, 97, 32, 
121, 101, 116, 117, 44, 32, 77, 105, 114, 105, 104, 105, 32, 97, 117, 32, 90, 117, 104, 117, 114, 97, 41, 32, 110, 97, 32, 105, 110, 97, 119, 101, 122, 97, 32, 107, 117, 119, 97, 32, 110, 97, 32, 97, 110, 103, 97, 104, 101, 119, 97, 44, 32, 116, 101, 110, 97, 32, 107, 97, 116, 105, 107, 97, 32, 117, 112, 101, 111, 32, 119, 97, 32, 106, 111, 116, 111, 32, 117, 110, 97, 111, 114, 117, 104, 117, 115, 117, 32, 107, 117, 119, 101, 112, 111, 32, 107, 119, 97, 32, 117, 104, 97, 105, 46, 32, 91, 49, 93, 57345, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - attention_mask = [1 if x != 0 else 0 for x in input_ids] - token_type_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - # fmt: on - input_ids = mindspore.tensor([input_ids]) - attention_mask = mindspore.tensor([attention_mask]) - token_type_ids = mindspore.tensor([token_type_ids]) - outputs = model(input_ids, attention_mask, token_type_ids) - - # verify sequence output - expected_shape = (1, 2048, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [-0.161433131, 0.395568609, 0.0407391489], - [-0.108025983, 0.362060368, -0.544592619], - [-0.141537309, 0.180541009, 0.076907], - ] - ) - - self.assertTrue(ops.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-2)) - - # verify pooled output - expected_shape = (1, 768) - self.assertEqual(outputs.pooler_output.shape, expected_shape) - - expected_slice = mindspore.tensor([-0.884311497, -0.529064834, 0.723164916]) - - self.assertTrue(ops.allclose(outputs.pooler_output[0, :3], expected_slice, atol=1e-2)) \ No newline at end of file diff --git a/tests/transformers/models/canine/test_tokenization_canine.py b/tests/transformers/models/canine/test_tokenization_canine.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/chatglm/__init__.py b/tests/transformers/models/chatglm/__init__.py deleted file 
mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/chatglm/test_modeling_chatglm.py b/tests/transformers/models/chatglm/test_modeling_chatglm.py deleted file mode 100644 index 91178004b..000000000 --- a/tests/transformers/models/chatglm/test_modeling_chatglm.py +++ /dev/null @@ -1,170 +0,0 @@ -import datetime -import math -import unittest -import mindspore -import random - -from mindnlp.transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig -from mindnlp.utils.testing_utils import require_mindspore, slow - - -def set_random_seed(seed): - import random - - random.seed(seed) - - # pytorch RNGs - import mindspore - - mindspore.set_seed(seed) - - # numpy RNG - import numpy as np - - np.random.seed(seed) - - - -def ids_tensor(shape, vocab_size): - # Creates a random int32 tensor of the shape within the vocab size - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(random.randint(0, vocab_size - 1)) - - return mindspore.tensor(values, dtype=mindspore.int64).view(shape) - - -def get_model_and_tokenizer(): - model = AutoModelForSeq2SeqLM.from_pretrained("ZhipuAI/ChatGLM-6B", mirror='modelscope').half() - model.set_train(False) - tokenizer = AutoTokenizer.from_pretrained("ZhipuAI/ChatGLM-6B", mirror='modelscope') - return model, tokenizer - -def get_model_and_tokenizer_random_init(): - config = AutoConfig.from_pretrained("ZhipuAI/ChatGLM-6B", mirror='modelscope') - model = AutoModelForSeq2SeqLM.from_config(config).half() - model.set_train(False) - tokenizer = AutoTokenizer.from_pretrained("ZhipuAI/ChatGLM-6B", mirror='modelscope') - return model, tokenizer - -@require_mindspore -class ChatGLMGenerationTest(unittest.TestCase): - def get_generation_kwargs(self): - pass - - @slow - def test_chat(self): - model, tokenizer = get_model_and_tokenizer() - prompts = ["你好", "介绍一下清华大学", "它创建于哪一年"] - history = [] - set_random_seed(42) - expected_responses = [ - '你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '清华大学是中国著名的综合性研究型大学,位于中国北京市海淀区,创建于 1911 年,前身是清华学堂。作为我国顶尖高等教育机构之一,清华大学在科学研究、工程技术、信息技术、经济管理等领域处于领先地位,也是世界上最著名的工程学府之一。\n\n清华大学拥有世界一流的教学设施和科学研究平台,设有多个学院和研究中心,包括工程学院、自然科学学院、社会科学学院、人文学院、法学院、经济管理学院等。学校拥有众多知名教授和研究团队,其中包括多位院士、国家杰出青年科学基金获得者、长江学者等。\n\n清华大学的本科生招生范围为全国中学毕业生,本科生入学要求严格,考试成绩优秀。同时,清华大学也提供研究生和博士生招生,包括硕士研究生和博士研究生。', - '清华大学创建于 1911 年。' - ] - for (prompt, expected_response) in zip(prompts, expected_responses): - response, history = model.chat(tokenizer, prompt, history=history) - print(repr(response)) - self.assertEquals(expected_response, response) - - @slow - def test_stream_chat(self): - model, tokenizer = get_model_and_tokenizer() - prompts = ["你好", "介绍一下清华大学", "它创建于哪一年"] - history = [] - expected_responses = [ - '你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '清华大学是中国著名的综合性研究型大学,位于中国北京市海淀区,创建于 1911 年,前身是清华学堂。作为我国顶尖高等教育机构之一,清华大学在科学研究、工程技术、信息技术、经济管理等领域处于领先地位,也是世界上最著名的工程学府之一。\n\n清华大学拥有世界一流的教学设施和科学研究平台,设有多个学院和研究中心,包括工程学院、自然科学学院、社会科学学院、人文学院、法学院、经济管理学院等。学校拥有众多知名教授和研究团队,其中包括多位院士、国家杰出青年科学基金获得者、长江学者等。\n\n清华大学的本科生招生范围为全国中学毕业生,本科生入学要求严格,考试成绩优秀。同时,清华大学也提供研究生和博士生招生,包括硕士研究生和博士研究生。', - '清华大学创建于 1911 年。' - ] - set_random_seed(42) - for prompt, expected_response in zip(prompts, expected_responses): - response = "" - for idx, (response, history) in enumerate(model.stream_chat(tokenizer, prompt, history=history)): - pass - print(repr(response)) - self.assertEquals(expected_response, response) - - @slow - def test_generation(self): - model, tokenizer = get_model_and_tokenizer() - sentence = "晚上睡不着怎么办" - parameters = 
[(False, 2048, 1), - (False, 64, 1), - (True, 2048, 1), - (True, 64, 1), - (True, 2048, 4)] - expected_out_sentences = [ - '晚上睡不着怎么办 以下是一些可能有助于在晚上入睡的方法:\n\n1. 保持规律的睡眠时间表:尽量在同一时间上床,并尝试在早上醒来时自然起床。\n\n2. 创建舒适的睡眠环境:保持房间安静、凉爽、黑暗、舒适,并使用舒适的床垫和枕头。\n\n3. 避免刺激性物质:避免饮用含咖啡因的饮料,如咖啡、茶和可乐,并尽可能减少饮酒。\n\n4. 放松身心:尝试进行放松的活动,如冥想、深呼吸、瑜伽或听轻柔的音乐。\n\n5. 避免在床上做其他事情:例如看电视、使用电脑或智能手机等。\n\n6. 练习放松技巧:例如渐进性肌肉松弛法、冥想或深呼吸练习。\n\n7. 寻求帮助:如果长时间都无法正常入睡,可以考虑咨询医生或专业心理医生,寻求更进一步的帮助。\n\n希望这些方法能有助于入睡。', - '晚上睡不着怎么办 以下是一些可能有助于在晚上入睡的方法:\n\n1. 保持规律的睡眠时间表:尽量在同一时间上床,并尝试在早上醒来时自然起床。\n\n2. 创建舒适的睡眠环境:保持房间安静、凉爽、黑暗、舒适,并使用舒适的床垫和枕头。', - '晚上睡不着怎么办 以下是一些有助于在晚上更好地入睡的方法:\n\n1. 维持规律的睡眠时间:每晚尽可能在同一时间上床,保持规律的睡眠时间表,帮助身体调整并更容易入睡。\n\n2. 避免在床上使用电子设备:手机、平板电脑、电脑等电子设备会发出蓝光,这会干扰身体释放褪黑素,进而导致难以入睡。建议你在睡前一小时停止使用这些设备。\n\n3. 创建舒适的睡眠环境:确保卧室安静、黑暗、凉爽,舒适的床垫和枕头,保持卧室温度适宜,这有助于让你更容易入睡。\n\n4. 放松身心:尝试进行一些放松的活动,如冥想、深呼吸、瑜伽或轻松的散步,减轻压力和焦虑,让你更容易入睡。\n\n5. 避免咖啡因和酒精:咖啡因和酒精会让大脑更加兴奋,进而干扰身体入睡过程。建议在睡前几小时避免饮用这些物质。\n\n6. 做一些安静的活动:阅读一本书、听轻柔的音乐、绣或者绘画等安静的活动,有助于自己放松身心,进而更容易入睡。\n\n如果采取以上这些方法仍然无法入睡,建议咨询医生或专业的睡眠专家,获取更好的建议和帮助。', - '晚上睡不着怎么办 以下是一些有助于在晚上更好地入睡的方法:\n\n1. 维持规律的睡眠时间:每晚尽可能在同一时间上床,保持规律的睡眠时间表,帮助身体调整并更容易入睡。\n\n2. 避免在床上使用电子设备:手机、平板电脑、电脑等电子设备会发出蓝光,这会干扰身体', - '晚上睡不着怎么办 以下是一些可能有助于在晚上入睡的方法:\n\n1. 建立规律的睡眠时间表:尽量在同一时间入睡和起床,即使在周末和假期也要尽量保持一致。\n\n2. 创造舒适的睡眠环境:保持房间安静、凉爽、黑暗、舒适,使用舒适的床垫和枕头等。\n\n3. 放松身心:尝试进行一些放松的活动,如冥想、深呼吸、瑜伽、听轻柔的音乐等,缓解压力和紧张情绪。\n\n4. 避免刺激性物质:避免饮用咖啡、茶、可乐等含咖啡因的饮料,避免吸烟和饮酒等刺激性物质。\n\n5. 避免躺在床上翻来覆去:如果躺在床上超过20分钟还不能入睡,就不要躺在床上翻来覆去,而是起床去做一些放松的活动,直到感到困倦为止。\n\n6. 练习放松技巧:如果感到焦虑或紧张,可以尝试进行一些放松技巧,如渐进性肌肉松弛、冥想等。\n\n7. 改善睡眠障碍:如果已经尝试了上述方法仍然无法入睡,可以考虑咨询医生,了解是否存在其他睡眠障碍问题,并接受相应的治疗。'] - for (do_sample, max_length, num_beams), expected_output_sentence in zip(parameters, expected_out_sentences): - set_random_seed(42) - inputs = tokenizer(sentence, return_tensors="ms") - - outputs = model.generate( - **inputs, - do_sample=do_sample, - max_length=max_length, - num_beams=num_beams - ) - - outputs = outputs.tolist()[0] - out_sentence = tokenizer.decode(outputs, skip_special_tokens=True) - print(out_sentence) - self.assertEquals(expected_output_sentence, out_sentence) - - @slow - def test_batch_generation(self): - model, tokenizer = get_model_and_tokenizer() - sentences = [ - "你好", - "介绍一下清华大学" - ] - parameters = [ - (False, 2048, 1), - (False, 64, 1), - (True, 2048, 1), - (True, 64, 1), - (True, 2048, 4)] - expected_out_sentences = [ - ['你好 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '介绍一下清华大学 清华大学是中国著名的综合性大学,位于北京市海淀区双清路30号,其历史可以追溯到1911年创建的清华学堂,1925年更名为清华学校,1937年抗日战争全面爆发后南迁长沙,1946年迁回清华园。新中国成立后,清华学校更名为清华大学。\n\n清华大学是中国最顶尖的大学之一,在工程、科学、技术、经济、管理等领域都有很高的学术声誉和影响力。学校拥有世界一流的教学设施和科学研究平台,有多个学院和研究中心,包括工程学院、自然科学学院、人文学院、社会科学学院、经济管理学院、法学院、美术学院、医学院、器学院等。\n\n清华大学的本科生招生始于2000年,实行全面二孩政策后,本科生招生规模不断扩大。截至2022年,清华大学共有本科生近3万人,研究生近2万人,其中国际学生占比约为10%。清华大学的本科生教育注重通识教育和个性化培养,强调实践、创新、国际化和综合素质。'], - [ - '你好 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '介绍一下清华大学 清华大学是中国著名的综合性大学,位于北京市海淀区双清路30号,其历史可以追溯到1911年创建的清华学堂,1925年更名为清华学校,1937年抗日战争全面爆发后南迁长沙,1946年迁回' - ], - [ - '你好 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '介绍一下清华大学 清华大学是中国著名的综合性研究型大学,位于北京市海淀区双清路 30 号,其溯源于 1911 年创建的清华学堂, 1925 年更名为清华学校, 1937 年秋抗日战争全面爆发后闭校。1949 年 10 月开学复校,成为我国第一个社会主义大学生活了的高校。截至 2023 年,清华学校共管辖 2 个学院、13 个系,有本科专业 60 个,研究生专业 190 个。' - ], - [ - '你好 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '介绍一下清华大学 清华大学是中国著名的综合性研究型大学,位于北京市海淀区双清路 30 号,其溯源于 1911 年创建的清华学堂, 1925 年更名为清华学校, 1937 年秋抗日战争全面爆发后' - ], - [ - '你好 你好👋!我是人工智能助手 ChatGLM-6B,很高兴见到你,欢迎问我任何问题。', - '介绍一下清华大学 清华大学是中国著名的综合性研究型大学,位于北京市海淀区双清路30号,其历史可以追溯到1911年创建的清华学堂,1925年更名为清华学校,1937年抗日战争全面爆发后南迁长沙,与北京大学、南开大学组建国立长沙临时大学,1938年迁至 
昆明改名为国立西南联合大学,1946年迁回北京。新中国成立后,清华学校更名为清华大学。' - ] - ] - for (do_sample, max_length, num_beams), expected_output_sentence in zip(parameters, expected_out_sentences): - set_random_seed(42) - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - - outputs = model.generate( - **inputs, - do_sample=do_sample, - max_length=max_length, - num_beams=num_beams - ) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - print(batch_out_sentence) - self.assertListEqual(expected_output_sentence, batch_out_sentence) diff --git a/tests/transformers/models/chinese_clip/__init__.py b/tests/transformers/models/chinese_clip/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/chinese_clip/test_modeling_chinese_clip.py b/tests/transformers/models/chinese_clip/test_modeling_chinese_clip.py deleted file mode 100644 index 335aa2fcd..000000000 --- a/tests/transformers/models/chinese_clip/test_modeling_chinese_clip.py +++ /dev/null @@ -1,707 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Chinese-CLIP model.""" - -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - ChineseCLIPModel, - ChineseCLIPTextModel, - ChineseCLIPVisionModel, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ChineseCLIPProcessor - - -class ChineseCLIPTextModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = 
vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - """ - Returns a tiny configuration by default. - """ - return ChineseCLIPTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ChineseCLIPTextModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = ChineseCLIPTextModel(config) - model.eval() - result = 
model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -class ChineseCLIPVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return ChineseCLIPVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = ChineseCLIPVisionModel(config=config) - model.eval() - with no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = 
{"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ChineseCLIPTextModel,) if is_mindspore_available() else () - fx_compatible = False - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - inputs_dict["next_sentence_label"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = ChineseCLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=ChineseCLIPTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - @slow - def test_model_from_pretrained(self): - model_name = "OFA-Sys/chinese-clip-vit-base-patch16" - model = ChineseCLIPTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="ChineseCLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="ChineseCLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - -@require_mindspore -class ChineseCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the 
tests of test_modeling_common.py, as CHINESE_CLIP does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (ChineseCLIPVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = ChineseCLIPVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=ChineseCLIPVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="CHINESE_CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="ChineseCLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="ChineseCLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "OFA-Sys/chinese-clip-vit-base-patch16" - model = ChineseCLIPVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class ChineseCLIPModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = ChineseCLIPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = ChineseCLIPVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - ( - config, - input_ids, - token_type_ids, - attention_mask, - _, - __, - ___, - ) = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = 
self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, token_type_ids, attention_mask, pixel_values - - def get_config(self): - return ChineseCLIPConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask, pixel_values): - model = ChineseCLIPModel(config).eval() - with no_grad(): - result = model(input_ids, pixel_values, attention_mask, token_type_ids) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, token_type_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class ChineseCLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ChineseCLIPModel,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": ChineseCLIPModel} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - text_kwargs = {"use_labels": False, "batch_size": 12} - vision_kwargs = {"batch_size": 12} - self.model_tester = ChineseCLIPModelTester(self, text_kwargs, vision_kwargs) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="ChineseCLIPModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for CHINESE_CLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for sub_config_key in ("vision_config", "text_config"): - sub_config = getattr(configs_no_init, sub_config_key, {}) - setattr(configs_no_init, sub_config_key, _config_zero_init(sub_config)) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not 
properly initialized", - ) - - @slow - def test_model_from_pretrained(self): - model_name = "OFA-Sys/chinese-clip-vit-base-patch16" - model = ChineseCLIPModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of Pikachu -def prepare_img(): - url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class ChineseCLIPModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "OFA-Sys/chinese-clip-vit-base-patch16" - model = ChineseCLIPModel.from_pretrained(model_name) - processor = ChineseCLIPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="ms" - ) - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - probs = ops.softmax(outputs.logits_per_image, dim=1) - expected_probs = mindspore.tensor([[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]]) - - self.assertTrue(ops.allclose(probs, expected_probs, atol=5e-3)) - - @slow - def test_inference_interpolate_pos_encoding(self): - # ViT models have an `interpolate_pos_encoding` argument in their forward method, - # allowing to interpolate the pre-trained position embeddings in order to use - # the model on higher resolutions. The DINO model by Facebook AI leverages this - # to visualize self-attention on higher resolution images. - model_name = "OFA-Sys/chinese-clip-vit-base-patch16" - model = ChineseCLIPModel.from_pretrained(model_name) - - image_processor = ChineseCLIPProcessor.from_pretrained( - model_name, size={"height": 180, "width": 180}, crop_size={"height": 180, "width": 180} - ) - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = image_processor(text="what's in the image", images=image, return_tensors="ms") - - # interpolate_pos_encodiung false should return value error - with self.assertRaises(ValueError, msg="doesn't match model"): - with no_grad(): - model(**inputs, interpolate_pos_encoding=False) - - # forward pass - with no_grad(): - outputs = model(**inputs, interpolate_pos_encoding=True) - - # verify the logits - expected_shape = (1, 122, 768) - - self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.3990, 0.2983, -0.1239], [-0.1452, -0.2759, 0.0403], [-0.3149, -0.4763, 0.8555]] - ) - - print(outputs.vision_model_output.last_hidden_state[0, :3, :3]) - self.assertTrue( - ops.allclose(outputs.vision_model_output.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4) - ) \ No newline at end of file diff --git a/tests/transformers/models/clap/__init__.py b/tests/transformers/models/clap/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/clap/test_feature_extraction_clap.py b/tests/transformers/models/clap/test_feature_extraction_clap.py deleted file mode 100644 index 7555675ee..000000000 --- a/tests/transformers/models/clap/test_feature_extraction_clap.py +++ /dev/null @@ -1,560 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
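# ---------------------------------------------------------------------------------------
# Illustrative sketch: a minimal, standalone version of the zero-shot image-text matching
# pattern exercised by the Chinese-CLIP integration test above. It reuses only names that
# already appear in that test (ChineseCLIPModel, ChineseCLIPProcessor, the
# "OFA-Sys/chinese-clip-vit-base-patch16" checkpoint, the Pokemon image URL, and the four
# Chinese captions, which are Pokemon names); network access for the weights and the image
# is assumed, and this is a sketch of the pattern rather than prescribed usage.
import requests
from PIL import Image

from mindnlp.core import no_grad, ops
from mindnlp.transformers import ChineseCLIPModel, ChineseCLIPProcessor

model_name = "OFA-Sys/chinese-clip-vit-base-patch16"
model = ChineseCLIPModel.from_pretrained(model_name)
processor = ChineseCLIPProcessor.from_pretrained(model_name)

url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

# Score the image against candidate captions (Squirtle, Bulbasaur, Charmander, Pikachu).
inputs = processor(
    text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="ms"
)
with no_grad():
    outputs = model(**inputs)

# logits_per_image has shape (num_images, num_texts); a softmax over the text axis gives
# per-caption probabilities, which the integration test compares against expected values.
probs = ops.softmax(outputs.logits_per_image, dim=1)
print(probs)
# ---------------------------------------------------------------------------------------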
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import itertools -import random -import unittest - -import numpy as np -from datasets import load_dataset - -from mindnlp.transformers import ClapFeatureExtractor -# from transformers.trainer_utils import set_seed -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - -global_rng = random.Random() - - -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - -def set_seed(seed: int): - """ - Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed). - - Args: - seed (`int`): The seed to set. - """ - random.seed(seed) - np.random.seed(seed) - if is_mindspore_available(): - import mindspore - mindspore.set_seed(seed) - -@require_mindspore -# @require_mindsporeaudio -# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap -class ClapFeatureExtractionTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - min_seq_length=400, - max_seq_length=2000, - feature_size=10, - hop_length=160, - chunk_length=8, - padding_value=0.0, - sampling_rate=4_000, - return_attention_mask=False, - do_normalize=True, - ): - self.parent = parent - self.batch_size = batch_size - self.min_seq_length = min_seq_length - self.max_seq_length = max_seq_length - self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) - self.padding_value = padding_value - self.sampling_rate = sampling_rate - self.return_attention_mask = return_attention_mask - self.do_normalize = do_normalize - self.feature_size = feature_size - self.chunk_length = chunk_length - self.hop_length = hop_length - - def prepare_feat_extract_dict(self): - return { - "feature_size": self.feature_size, - "hop_length": self.hop_length, - "chunk_length": self.chunk_length, - "padding_value": self.padding_value, - "sampling_rate": self.sampling_rate, - "return_attention_mask": self.return_attention_mask, - "do_normalize": self.do_normalize, - } - - def prepare_inputs_for_common(self, equal_length=False, numpify=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_length: - speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] - else: - # make sure that inputs increase in size - speech_inputs = [ - floats_list((x, self.feature_size)) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) - ] - if numpify: - 
speech_inputs = [np.asarray(x) for x in speech_inputs] - return speech_inputs - - -@require_mindspore -# @require_mindsporeaudio -class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = ClapFeatureExtractor - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.setUp with Whisper->Clap - def setUp(self): - self.feat_extract_tester = ClapFeatureExtractionTester(self) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test feature size - input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features - self.assertTrue(input_features.ndim == 4) - - # Test not batched input - encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. - speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad - def test_double_precision_pad(self): - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="ms") - self.assertTrue(pt_processed.input_features.dtype == mindspore.float32) - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_integration_fusion_short_input(self): - # fmt: off - EXPECTED_INPUT_FEATURES = mindspore.Tensor( - [ - [ - # "repeat" - [ - -20.1049, -19.9764, 
-20.0731, -19.5055, -27.5018, -22.5761, -26.6071, - -29.0091, -26.4659, -26.4236, -28.8808, -31.9190, -32.4848, -34.1186, - -34.0340, -32.8803, -30.9895, -37.6238, -38.0347, -40.6263, -36.3496, - -42.2533, -32.9132, -27.7068, -29.3704, -30.3208, -22.5972, -27.1494, - -30.1975, -31.1005, -29.9372, -27.1917, -25.9806, -30.3489, -33.2380, - -31.9062, -36.5498, -32.8721, -30.5629, -27.4674, -22.2232, -22.5653, - -16.3868, -17.2713, -25.9738, -30.6256, -34.3766, -31.1292, -27.8950, - -27.0588, -25.6206, -23.0712, -26.6050, -28.0112, -32.6847, -34.3396, - -34.9738, -35.8463, -39.2324, -37.1188, -33.3705, -28.9230, -28.9112, - -28.6578 - ], - [ - -36.7233, -30.0587, -24.8431, -18.4611, -16.8149, -23.9319, -32.8580, - -34.2264, -27.4332, -26.8027, -29.2721, -33.9033, -39.3403, -35.3232, - -26.8076, -28.6460, -35.2780, -36.0738, -35.4996, -37.7631, -39.5056, - -34.7112, -36.8741, -34.1066, -32.9474, -33.6604, -27.9937, -30.9594, - -26.2928, -32.0485, -29.2151, -29.2917, -32.7308, -29.6542, -31.1454, - -37.0088, -32.3388, -37.3086, -31.1024, -27.2889, -19.6788, -21.1488, - -19.5144, -14.8889, -21.2006, -24.7488, -27.7940, -31.1058, -27.5068, - -21.5737, -22.3780, -21.5151, -26.3086, -30.9223, -33.5043, -32.0307, - -37.3806, -41.6188, -45.6650, -40.5131, -32.5023, -26.7385, -26.3709, - -26.7761 - ] - ], - [ - # "repeatpad" - [ - -25.7496, -24.9339, -24.1357, -23.1271, -23.7853, -26.1264, -29.1456, - -33.2060, -37.8179, -42.4833, -41.9386, -41.2164, -42.3566, -44.2575, - -40.0217, -36.6794, -36.6974, -38.7819, -42.0880, -45.5560, -39.9368, - -36.3219, -35.5981, -36.6434, -35.1851, -33.0684, -30.0437, -30.2010, - -34.3476, -42.1373, -38.8039, -37.3355, -40.4576, -41.0485, -40.6377, - -38.2275, -42.7481, -34.6084, -34.7048, -29.5149, -26.3935, -26.8952, - -34.1336, -26.2904, -28.2571, -32.5642, -36.7240, -35.5334, -38.2451, - -34.8177, -28.9754, -25.1096, -27.9768, -32.3184, -37.0269, -40.5136, - -40.8061, -36.4948, -40.3767, -38.9671, -38.3552, -34.1250, -30.9035, - -31.6112 - ], - [ - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100. 
- ] - ], - [ - # None, same as "repeatpad" - [ - -25.7496, -24.9339, -24.1357, -23.1271, -23.7853, -26.1264, -29.1456, - -33.2060, -37.8179, -42.4833, -41.9386, -41.2164, -42.3566, -44.2575, - -40.0217, -36.6794, -36.6974, -38.7819, -42.0880, -45.5560, -39.9368, - -36.3219, -35.5981, -36.6434, -35.1851, -33.0684, -30.0437, -30.2010, - -34.3476, -42.1373, -38.8039, -37.3355, -40.4576, -41.0485, -40.6377, - -38.2275, -42.7481, -34.6084, -34.7048, -29.5149, -26.3935, -26.8952, - -34.1336, -26.2904, -28.2571, -32.5642, -36.7240, -35.5334, -38.2451, - -34.8177, -28.9754, -25.1096, -27.9768, -32.3184, -37.0269, -40.5136, - -40.8061, -36.4948, -40.3767, -38.9671, -38.3552, -34.1250, -30.9035, - -31.6112 - ], - [ - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100. - ] - ], - [ - # "pad" - [ - -58.5260, -58.1155, -57.8623, -57.5059, -57.9178, -58.7171, -59.2343, - -59.9833, -60.9764, -62.0722, -63.5723, -65.7111, -67.5153, -68.7088, - -69.8325, -70.2987, -70.1548, -70.6233, -71.5702, -72.5159, -72.3821, - -70.1817, -67.0315, -64.1387, -62.2202, -61.0717, -60.4951, -61.6005, - -63.7358, -67.1400, -67.6185, -65.5635, -64.3593, -63.7138, -63.6209, - -66.4950, -72.6284, -63.3961, -56.8334, -52.7319, -50.6310, -51.3728, - -53.5619, -51.9190, -50.9708, -52.8684, -55.8073, -58.8227, -60.6991, - -57.0547, -52.7611, -51.4388, -54.4892, -60.8950, -66.1024, -72.4352, - -67.8538, -65.1463, -68.7588, -72.3080, -68.4864, -60.4688, -57.1516, - -60.9460 - ], - [ - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100. 
- ] - ] - ] - ) - # fmt: on - MEL_BIN = [[976, 977], [976, 977], [976, 977], [196, 197]] - input_speech = self._load_datasamples(1) - feature_extractor = ClapFeatureExtractor() - for padding, EXPECTED_VALUES, idx_in_mel in zip( - ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, MEL_BIN - ): - input_features = feature_extractor(input_speech, return_tensors="ms", padding=padding).input_features - self.assertEqual(input_features.shape, (1, 4, 1001, 64)) - - self.assertTrue(np.allclose(input_features[0, 0, idx_in_mel[0]].asnumpy(), EXPECTED_VALUES[0].asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(input_features[0, 0, idx_in_mel[1]].asnumpy(), EXPECTED_VALUES[1].asnumpy(), atol=1e-4)) - - self.assertTrue(ops.all(input_features[0, 0] == input_features[0, 1])) - self.assertTrue(ops.all(input_features[0, 0] == input_features[0, 2])) - self.assertTrue(ops.all(input_features[0, 0] == input_features[0, 3])) - - def test_integration_rand_trunc_short_input(self): - # fmt: off - EXPECTED_INPUT_FEATURES = mindspore.Tensor( - [ - [ - # "repeat" - [ - -35.0483, -35.7865, -38.2884, -40.0220, -42.5349, -44.9489, -43.2228, - -44.6499, -47.6253, -49.6983, -50.2127, -52.5483, -52.2223, -51.9157, - -49.4082, -51.2024, -57.0476, -56.2803, -58.1618, -60.7474, -55.0389, - -60.9514, -59.3080, -50.4419, -47.8172, -48.7570, -55.2552, -44.5036, - -44.1148, -50.8218, -51.0968, -52.9408, -51.1037, -48.9789, -47.5897, - -52.0915, -55.4216, -54.1529, -58.0149, -58.0866, -52.7798, -52.6154, - -45.9144, -46.2008, -40.7603, -41.1703, -50.2250, -55.4112, -59.4818, - -54.5795, -53.5552, -51.3668, -49.8358, -50.3186, -54.0452, -57.6030, - -61.1589, -61.6415, -63.2756, -66.5890, -62.8543, -58.0665, -56.7203, - -56.7632 - ], - [ - -47.1320, -37.9961, -34.0076, -36.7109, -47.9057, -48.4924, -43.8371, - -44.9728, -48.1689, -52.9141, -57.6077, -52.8520, -44.8502, -45.6764, - -51.8389, -56.4284, -54.6972, -53.4889, -55.6077, -58.7149, -60.3760, - -54.0136, -56.0730, -55.9870, -54.4017, -53.1094, -53.5640, -50.3064, - -49.9520, -49.3239, -48.1668, -53.4852, -50.4561, -50.8688, -55.1970, - -51.5538, -53.0260, -59.6933, -54.8183, -59.5895, -55.9589, -50.3761, - -44.1282, -44.1463, -43.8540, -39.1168, -45.3893, -49.5542, -53.1505, - -55.2870, -50.3921, -46.8511, -47.4444, -49.5633, -56.0034, -59.0815, - -59.0018, -63.7589, -69.5745, -71.5789, -64.0498, -56.0558, -54.3475, - -54.7004 - ] - ], - [ - # "repeatpad" - [ - -40.3184, -39.7186, -39.8807, -41.6508, -45.3613, -50.4785, -57.0297, - -60.4944, -59.1642, -58.9495, -60.4661, -62.5300, -58.4759, -55.2865, - -54.8973, -56.0780, -57.5482, -59.6557, -64.3309, -65.0330, -59.4941, - -56.8552, -55.0519, -55.9817, -56.9739, -55.2827, -54.5312, -51.4141, - -50.4289, -51.9131, -57.5821, -63.9979, -59.9180, -58.9489, -62.3247, - -62.6975, -63.7948, -60.5250, -64.6107, -58.7905, -57.0229, -54.3084, - -49.8445, -50.4459, -57.0172, -50.6425, -52.5992, -57.4207, -61.6358, - -60.6540, -63.1968, -57.4360, -52.3263, -51.7695, -57.1946, -62.9610, - -66.7359, -67.0335, -63.7440, -68.1775, -66.3798, -62.8650, -59.8972, - -59.3139 - ], - [ - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., 
-100. - ] - ], - [ - # None, same as "repeatpad" - [ - -40.3184, -39.7186, -39.8807, -41.6508, -45.3613, -50.4785, -57.0297, - -60.4944, -59.1642, -58.9495, -60.4661, -62.5300, -58.4759, -55.2865, - -54.8973, -56.0780, -57.5482, -59.6557, -64.3309, -65.0330, -59.4941, - -56.8552, -55.0519, -55.9817, -56.9739, -55.2827, -54.5312, -51.4141, - -50.4289, -51.9131, -57.5821, -63.9979, -59.9180, -58.9489, -62.3247, - -62.6975, -63.7948, -60.5250, -64.6107, -58.7905, -57.0229, -54.3084, - -49.8445, -50.4459, -57.0172, -50.6425, -52.5992, -57.4207, -61.6358, - -60.6540, -63.1968, -57.4360, -52.3263, -51.7695, -57.1946, -62.9610, - -66.7359, -67.0335, -63.7440, -68.1775, -66.3798, -62.8650, -59.8972, - -59.3139 - ], - [ - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100. - ] - ], - [ - # "pad" - [ - -73.3190, -73.6349, -74.1451, -74.8539, -75.7476, -76.5438, -78.5540, - -80.1339, -81.8911, -83.7560, -85.5387, -86.7466, -88.2072, -88.6090, - -88.8243, -89.0784, -89.4364, -89.8179, -91.3146, -92.2833, -91.7221, - -90.9440, -88.1315, -86.2425, -84.2281, -82.4893, -81.5993, -81.1328, - -81.5759, -83.1068, -85.6525, -88.9520, -88.9187, -87.2703, -86.3052, - -85.7188, -85.8802, -87.9996, -95.0464, -88.0133, -80.8561, -76.5597, - -74.2816, -74.8109, -77.3615, -76.0719, -75.3426, -77.6428, -80.9663, - -84.5275, -84.9907, -80.5205, -77.2851, -78.6259, -84.7740, -91.4535, - -98.1894, -94.3872, -92.3735, -97.6807, -98.1501, -91.4344, -85.2842, - -88.4338 - ], - [ - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100., -100., -100., -100., -100., -100., -100., - -100., -100., -100., -100. 
- ] - ] - ] - ) - # fmt: on - MEL_BIN = [[976, 977], [976, 977], [976, 977], [196, 197]] - input_speech = self._load_datasamples(1) - feature_extractor = ClapFeatureExtractor() - for padding, EXPECTED_VALUES, idx_in_mel in zip( - ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, MEL_BIN - ): - input_features = feature_extractor( - input_speech, return_tensors="ms", truncation="rand_trunc", padding=padding - ).input_features - self.assertEqual(input_features.shape, (1, 1, 1001, 64)) - self.assertTrue(np.allclose(input_features[0, 0, idx_in_mel[0]].asnumpy(), EXPECTED_VALUES[0].asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(input_features[0, 0, idx_in_mel[1]].asnumpy(), EXPECTED_VALUES[1].asnumpy(), atol=1e-4)) - - @slow - def test_integration_fusion_long_input(self): - # fmt: off - EXPECTED_INPUT_FEATURES = mindspore.Tensor( - [ - [ - -11.1830, -10.1894, -8.6051, -4.8578, -1.3268, -8.4606, -14.5453, - -9.2017, 0.5781, 16.2129, 14.8289, 3.6326, -3.8794, -6.5544, - -2.4408, 1.9531, 6.0967, 1.7590, -7.6730, -6.1571, 2.0052, - 16.6694, 20.6447, 21.2145, 13.4972, 15.9043, 16.8987, 4.1766, - 11.9428, 21.2372, 12.3016, 4.8604, 6.7241, 1.8543, 4.9235, - 5.3188, -0.9897, -1.2416, -6.5864, 2.9529, 2.9274, 6.4753, - 10.2300, 11.2127, 3.4042, -1.0055, -6.0475, -6.7524, -3.9801, - -1.4434, 0.4740, -0.1584, -4.5457, -8.5746, -8.8428, -13.1475, - -9.6079, -8.5798, -4.1143, -3.7966, -7.1651, -6.1517, -8.0258, - -12.1486 - ], - [ - -10.2017, -7.9924, -5.9517, -3.9372, -1.9735, -4.3130, 16.1647, - 25.0592, 23.5532, 14.4974, -7.0778, -10.2262, 6.4782, 20.3454, - 19.4269, 1.7976, -16.5070, 4.9380, 12.3390, 6.9285, -13.6325, - -8.5298, 1.0839, -5.9629, -8.4812, 3.1331, -2.0963, -16.6046, - -14.0070, -17.5707, -13.2080, -17.2168, -17.7770, -12.1111, -18.6184, - -17.1897, -13.9801, -12.0426, -23.5400, -25.6823, -23.5813, -18.7847, - -20.5473, -25.6458, -19.7585, -27.6007, -28.9276, -24.8948, -25.4458, - -22.2807, -19.6613, -19.2669, -15.7813, -19.6821, -24.3439, -22.2598, - -28.2631, -30.1017, -32.7646, -33.6525, -27.5639, -22.0548, -27.8054, - -29.6947 - ], - [ - -9.2078, -7.2963, -6.2095, -7.9959, -2.9280, -11.1843, -6.1490, - 5.0733, 19.2957, 21.4578, 14.6803, -3.3153, -6.3334, -2.3542, - 6.9509, 15.2965, 14.6620, 5.2075, -0.0873, 1.1919, 18.1986, - 20.8470, 10.8035, 2.2516, 7.6905, 7.7427, -1.2543, -5.0018, - 0.9809, -2.1584, -5.4580, -5.4760, -11.8888, -9.0605, -8.4638, - -9.9897, -0.0540, -5.1629, 0.0483, -4.1504, -4.8140, -7.8236, - -9.0622, -10.1742, -8.9597, -11.5380, -16.5603, -17.1858, -17.5032, - -20.9326, -23.9543, -25.2602, -25.3429, -27.4536, -26.8859, -22.7852, - -25.8288, -24.8399, -23.8893, -24.2096, -26.5415, -23.7281, -25.6851, - -22.3629 - ], - [ - 1.3448, 2.9883, 4.0366, -0.8019, -10.4191, -10.0883, -4.3812, - 0.8136, 2.1579, 0.0832, 1.0949, -0.9759, -5.5319, -4.6009, - -6.5452, -14.9155, -20.1584, -9.3611, -2.4271, 1.4031, 4.9910, - 8.6916, 8.6785, 10.1973, 9.9029, 5.3840, 7.5336, 5.2803, - 2.8144, -0.3138, 2.2216, 5.7328, 7.5574, 7.7402, 1.0681, - 3.1049, 7.0742, 6.5588, 7.3712, 5.7881, 8.6874, 8.7725, - 2.8133, -4.5809, -6.1317, -5.1719, -5.0192, -9.0977, -10.9391, - -6.0769, 1.6016, -0.8965, -7.2252, -7.8632, -11.4468, -11.7446, - -10.7447, -7.0601, -2.7748, -4.1798, -2.8433, -3.1352, 0.8097, - 6.4212 - ] - ] - ) - # fmt: on - MEL_BIN = 963 - input_speech = ops.cat([mindspore.Tensor(x) for x in self._load_datasamples(5)]) - feature_extractor = ClapFeatureExtractor() - for padding, EXPECTED_VALUES, block_idx in zip( - ["repeat", "repeatpad", None, "pad"], 
EXPECTED_INPUT_FEATURES, [1, 2, 0, 3] - ): - set_seed(987654321) - input_features = feature_extractor(input_speech, return_tensors="ms", padding=padding).input_features - self.assertEqual(input_features.shape, (1, 4, 1001, 64)) - self.assertTrue(np.allclose(input_features[0, block_idx, MEL_BIN].asnumpy(), EXPECTED_VALUES.asnumpy(), atol=1e-3)) - - @slow - def test_integration_rand_trunc_long_input(self): - # fmt: off - EXPECTED_INPUT_FEATURES = mindspore.Tensor( - [ - [ - -35.4022, -32.7555, -31.2004, -32.7764, -42.5770, -41.6339, -43.1630, - -44.5080, -44.3029, -48.9628, -39.5022, -39.2105, -43.1350, -43.2195, - -48.4894, -52.2344, -57.6891, -52.2228, -45.5155, -44.2893, -43.4697, - -46.6702, -43.7490, -40.4819, -42.7275, -46.3434, -46.8412, -41.2003, - -43.1681, -46.2948, -46.1925, -47.8333, -45.6812, -44.9182, -41.7786, - -43.3809, -44.3199, -42.8814, -45.4771, -46.7114, -46.9746, -42.7090, - -41.6057, -38.3965, -40.1980, -41.0263, -34.1256, -28.3289, -29.0201, - -30.4453, -29.5561, -30.1734, -25.9406, -19.0897, -15.8452, -20.1351, - -23.6515, -23.1194, -17.1845, -19.4399, -23.6527, -22.8768, -20.7279, - -22.7864 - ], - [ - -35.7719, -27.2566, -23.6964, -27.5521, 0.2510, 7.4391, 1.3917, - -13.3417, -28.1758, -17.0856, -5.7723, -0.8000, -7.8832, -15.5548, - -30.5935, -24.7571, -13.7009, -10.3432, -21.2464, -24.8118, -19.4080, - -14.9779, -11.7991, -18.4485, -20.1982, -17.3652, -20.6328, -28.2967, - -25.7819, -21.8962, -28.5083, -29.5719, -30.2120, -35.7033, -31.8218, - -34.0408, -37.7744, -33.9653, -31.3009, -30.9063, -28.6153, -32.2202, - -28.5456, -28.8579, -32.5170, -37.9152, -43.0052, -46.4849, -44.0786, - -39.1933, -33.2757, -31.6313, -42.6386, -52.3679, -53.5785, -55.6444, - -47.0050, -47.6459, -56.6361, -60.6781, -61.5244, -55.8272, -60.4832, - -58.1897 - ], - [ - -38.2686, -36.6285, -32.5835, -35.1693, -37.7938, -37.4035, -35.3132, - -35.6083, -36.3609, -40.9472, -36.7846, -36.1544, -38.9076, -39.3618, - -35.4953, -34.2809, -39.9466, -39.7433, -34.8347, -37.5674, -41.5689, - -38.9161, -34.3947, -30.2924, -30.4841, -34.5831, -28.9261, -24.8849, - -31.2324, -27.1622, -27.2107, -25.9385, -30.1691, -30.9223, -23.9495, - -25.6047, -26.7119, -28.5523, -27.7481, -32.8427, -35.4650, -31.0399, - -31.2073, -30.5163, -22.9819, -20.8892, -19.2510, -24.7905, -28.9426, - -28.1998, -26.7386, -25.0140, -27.9223, -32.9913, -33.1864, -34.9742, - -38.5995, -39.6990, -29.3203, -22.4697, -25.6415, -33.5608, -33.0945, - -27.1716 - ], - [ - -33.2015, -28.7741, -21.9457, -23.4888, -32.1072, -8.6307, 3.2724, - 5.9157, -0.9221, -30.1814, -31.0015, -27.4508, -27.0477, -9.5342, - 0.3221, 0.6511, -7.1596, -25.9707, -32.8924, -32.2300, -13.8974, - -0.4895, 0.9168, -10.7663, -27.1176, -35.0829, -11.6859, -4.8855, - -11.8898, -26.6167, -5.6192, -3.8443, -19.7947, -14.4101, -8.6236, - -21.2458, -21.0801, -17.9136, -24.4663, -18.6333, -24.8085, -15.5854, - -15.4344, -11.5046, -22.3625, -27.3387, -32.4353, -30.9670, -31.3789, - -35.4044, -34.4591, -25.2433, -28.0773, -33.8736, -33.0224, -33.3155, - -38.5302, -39.2741, -36.6395, -34.7729, -32.4483, -42.4001, -49.2857, - -39.1682 - ] - ] - ) - # fmt: on - MEL_BIN = 963 - SEEDS = [987654321, 1234, 666, 5555] - input_speech = ops.cat([mindspore.Tensor(x) for x in self._load_datasamples(5)]) - feature_extractor = ClapFeatureExtractor() - for padding, EXPECTED_VALUES, seed in zip( - ["repeat", "repeatpad", None, "pad"], EXPECTED_INPUT_FEATURES, SEEDS - ): - set_seed(seed) - input_features = feature_extractor( - input_speech, return_tensors="ms", truncation="rand_trunc", 
padding=padding - ).input_features - self.assertEqual(input_features.shape, (1, 1, 1001, 64)) - self.assertTrue(np.allclose(input_features[0, 0, MEL_BIN].asnumpy(), EXPECTED_VALUES.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/clap/test_modeling_clap.py b/tests/transformers/models/clap/test_modeling_clap.py deleted file mode 100644 index e6cd21550..000000000 --- a/tests/transformers/models/clap/test_modeling_clap.py +++ /dev/null @@ -1,717 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore CLAP model.""" - -import inspect -import os -import tempfile -import unittest - -import numpy as np -from datasets import load_dataset - -from mindnlp.transformers import ClapAudioConfig, ClapConfig, ClapProcessor, ClapTextConfig -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - ClapAudioModel, - ClapAudioModelWithProjection, - ClapModel, - ClapTextModel, - ClapTextModelWithProjection, - ) - - -class ClapAudioModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=60, - num_mel_bins=16, - window_size=4, - spec_size=64, - patch_size=2, - patch_stride=2, - seq_length=16, - freq_ratio=2, - num_channels=3, - is_training=True, - hidden_size=32, - patch_embeds_hidden_size=16, - projection_dim=32, - depths=[2, 2], - num_hidden_layers=2, - num_heads=[2, 2], - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_mel_bins = num_mel_bins - self.window_size = window_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.depths = depths - self.num_heads = num_heads - self.num_attention_heads = num_heads[0] - self.seq_length = seq_length - self.spec_size = spec_size - self.freq_ratio = freq_ratio - self.patch_stride = patch_stride - self.patch_embeds_hidden_size = patch_embeds_hidden_size - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.num_mel_bins]) - config = self.get_config() - - return config, input_features - - def get_config(self): - return 
ClapAudioConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_mel_bins=self.num_mel_bins, - window_size=self.window_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - patch_stride=self.patch_stride, - projection_dim=self.projection_dim, - depths=self.depths, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - spec_size=self.spec_size, - freq_ratio=self.freq_ratio, - patch_embeds_hidden_size=self.patch_embeds_hidden_size, - ) - - def create_and_check_model(self, config, input_features): - model = ClapAudioModel(config=config) - model.eval() - with no_grad(): - result = model(input_features) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, input_features): - model = ClapAudioModelWithProjection(config=config) - model.eval() - with no_grad(): - result = model(input_features) - self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_features = config_and_inputs - inputs_dict = {"input_features": input_features} - return config, inputs_dict - - -@require_mindspore -class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (ClapAudioModel, ClapAudioModelWithProjection) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = ClapAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="ClapAudioModel does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [2 * self.model_tester.patch_embeds_hidden_size, 2 * self.model_tester.patch_embeds_hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work 
using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") - def test_retain_grad_hidden_states_attentions(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_features"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") - def test_training(self): - pass - - @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="ClapAudioModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="ClapAudioModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "laion/clap-htsat-fused" - model = ClapAudioModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "laion/clap-htsat-fused" - model = ClapAudioModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "audio_projection")) - - @unittest.skip('CPU has precision error') - def test_determinism(self): - pass - - @unittest.skip('CPU has precision error') - def test_model_outputs_equivalence(self): - pass - - @unittest.skip('CPU has precision error') - def test_save_load(self): - pass - - @unittest.skip('CPU has precision error') - def test_feed_forward_chunking(self): - pass - - -class ClapTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - projection_hidden_act="relu", - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = 
use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - self.projection_hidden_act = projection_hidden_act - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return ClapTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - projection_hidden_act=self.projection_hidden_act, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = ClapTextModel(config=config) - model.eval() - with no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, input_ids, input_mask): - model = ClapTextModelWithProjection(config=config) - model.eval() - with no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class ClapTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ClapTextModel, ClapTextModelWithProjection) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = ClapTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") - def test_training(self): - pass - - @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="ClapTextModel does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="ClapTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="ClapTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "laion/clap-htsat-fused" - model = ClapTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "laion/clap-htsat-fused" - model = ClapTextModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "text_projection")) - - -class ClapModelTester: - def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if audio_kwargs is None: - audio_kwargs = {} - - self.parent = parent - self.text_model_tester = ClapTextModelTester(parent, **text_kwargs) - self.audio_model_tester = ClapAudioModelTester(parent, **audio_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - _, input_features = self.audio_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, input_features - - def get_config(self): - return ClapConfig.from_text_audio_configs( - self.text_model_tester.get_config(), self.audio_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, input_features): - model = ClapModel(config).eval() - with no_grad(): - result = model(input_ids, input_features, attention_mask) - self.parent.assertEqual( - result.logits_per_audio.shape, (self.audio_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.audio_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, input_features = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "input_features": input_features, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class 
ClapModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ClapModel,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": ClapModel} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = ClapModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="ClapModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip('CPU has precision error') - def test_determinism(self): - pass - - @unittest.skip('CPU has precision error') - def test_model_outputs_equivalence(self): - pass - - @unittest.skip('CPU has precision error') - def test_feed_forward_chunking(self): - pass - - @unittest.skip('CPU has precision error') - def test_save_load(self): - pass - - # override as the `logit_scale` parameter initilization is different for CLAP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_audio_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save ClapConfig and check if we can load ClapAudioConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - audio_config = ClapAudioConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.audio_config.to_dict(), audio_config.to_dict()) - - # Save ClapConfig and check if we can load ClapTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = ClapTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "laion/clap-htsat-fused" - model = ClapModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@slow -@require_mindspore -class ClapModelIntegrationTest(unittest.TestCase): - paddings = ["repeatpad", "repeat", "pad"] - - def test_integration_unfused(self): - EXPECTED_MEANS_UNFUSED = { - "repeatpad": 0.0024, - "pad": 0.0020, - "repeat": 0.0023, - } - - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", 
split="validation") - audio_sample = librispeech_dummy[-1] - - model_id = "laion/clap-htsat-unfused" - - model = ClapModel.from_pretrained(model_id) - processor = ClapProcessor.from_pretrained(model_id) - - for padding in self.paddings: - inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="ms", padding=padding) - - audio_embed = model.get_audio_features(**inputs) - expected_mean = EXPECTED_MEANS_UNFUSED[padding] - - self.assertTrue( - ops.allclose(audio_embed.mean(), mindspore.tensor([expected_mean]), atol=1e-3, rtol=1e-3) - ) - - def test_integration_fused(self): - EXPECTED_MEANS_FUSED = { - "repeatpad": 0.00069, - "repeat": 0.00196, - "pad": -0.000379, - } - - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - audio_sample = librispeech_dummy[-1] - - model_id = "laion/clap-htsat-fused" - - model = ClapModel.from_pretrained(model_id) - processor = ClapProcessor.from_pretrained(model_id) - - for padding in self.paddings: - inputs = processor( - audios=audio_sample["audio"]["array"], return_tensors="ms", padding=padding, truncation="fusion" - ) - - audio_embed = model.get_audio_features(**inputs) - expected_mean = EXPECTED_MEANS_FUSED[padding] - - self.assertTrue( - ops.allclose(audio_embed.mean(), mindspore.tensor([expected_mean]), atol=1e-3, rtol=1e-3) - ) - - def test_batched_fused(self): - EXPECTED_MEANS_FUSED = { - "repeatpad": 0.0010, - "repeat": 0.0020, - "pad": 0.0006, - } - - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] - - model_id = "laion/clap-htsat-fused" - - model = ClapModel.from_pretrained(model_id) - processor = ClapProcessor.from_pretrained(model_id) - - for padding in self.paddings: - inputs = processor(audios=audio_samples, return_tensors="ms", padding=padding, truncation="fusion") - - audio_embed = model.get_audio_features(**inputs) - expected_mean = EXPECTED_MEANS_FUSED[padding] - - self.assertTrue( - ops.allclose(audio_embed.mean(), mindspore.tensor([expected_mean]), atol=1e-3, rtol=1e-3) - ) - - def test_batched_unfused(self): - EXPECTED_MEANS_FUSED = { - "repeatpad": 0.0016, - "repeat": 0.0019, - "pad": 0.0019, - } - - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] - - model_id = "laion/clap-htsat-unfused" - - model = ClapModel.from_pretrained(model_id) - processor = ClapProcessor.from_pretrained(model_id) - - for padding in self.paddings: - inputs = processor(audios=audio_samples, return_tensors="ms", padding=padding) - - audio_embed = model.get_audio_features(**inputs) - expected_mean = EXPECTED_MEANS_FUSED[padding] - - self.assertTrue( - ops.allclose(audio_embed.mean(), mindspore.tensor([expected_mean]), atol=1e-3, rtol=1e-3) - ) \ No newline at end of file diff --git a/tests/transformers/models/clap/test_processor_clap.py b/tests/transformers/models/clap/test_processor_clap.py deleted file mode 100644 index 568629ffb..000000000 --- a/tests/transformers/models/clap/test_processor_clap.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import shutil -import tempfile -import unittest - -from mindnlp.transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer, RobertaTokenizerFast -from mindnlp.utils.testing_utils import require_sentencepiece, slow - -from .test_feature_extraction_clap import floats_list - - -# @require_torchaudio -@require_sentencepiece -class ClapProcessorTest(unittest.TestCase): - def setUp(self): - self.checkpoint = "laion/clap-htsat-unfused" - self.tmpdirname = tempfile.mkdtemp() - - def get_tokenizer(self, **kwargs): - return RobertaTokenizer.from_pretrained(self.checkpoint, **kwargs) - - def get_feature_extractor(self, **kwargs): - return ClapFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - - processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - processor.save_pretrained(self.tmpdirname) - processor = ClapProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor) - - def test_save_load_pretrained_additional_features(self): - processor = ClapProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) - - processor = ClapProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor) - - def test_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(audios=raw_speech, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = 
processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - self.assertListEqual( - processor.model_input_names[2:], - feature_extractor.model_input_names, - msg="`processor` and `feature_extractor` model input names do not match", - ) diff --git a/tests/transformers/models/clip/__init__.py b/tests/transformers/models/clip/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/clip/test_modeling_clip.py b/tests/transformers/models/clip/test_modeling_clip.py deleted file mode 100644 index 63e06697f..000000000 --- a/tests/transformers/models/clip/test_modeling_clip.py +++ /dev/null @@ -1,668 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
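Both the ClapModelTest deleted above and the CLIPModelTest deleted below assert two things about the contrastive head: the learnable temperature `logit_scale` is initialised to `np.log(1 / 0.07)`, and the similarity logits come out as `(num_texts, num_images)` and its transpose. The following is a minimal NumPy sketch of the scoring scheme those assertions describe; the name `contrastive_logits` and the use of unit-normalised embeddings are illustrative assumptions, not the model's actual API.

import numpy as np

# Temperature is stored as a log-scale; exp(logit_scale) == 1 / 0.07 at
# initialisation, which is the raw value the tests compare against.
logit_scale_init = np.log(1 / 0.07)

def contrastive_logits(text_embeds, image_embeds, logit_scale=logit_scale_init):
    # Normalise each embedding, then scale the cosine-similarity matrix.
    text_embeds = text_embeds / np.linalg.norm(text_embeds, axis=-1, keepdims=True)
    image_embeds = image_embeds / np.linalg.norm(image_embeds, axis=-1, keepdims=True)
    logits_per_text = np.exp(logit_scale) * text_embeds @ image_embeds.T
    return logits_per_text, logits_per_text.T  # (n_text, n_image), (n_image, n_text)

# Shapes mirror what create_and_check_model asserts in the deleted tests:
texts, images = np.random.randn(12, 64), np.random.randn(12, 64)
per_text, per_image = contrastive_logits(texts, images)
assert per_text.shape == (12, 12) and per_image.shape == (12, 12)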
-"""Testing suite for the MindSpore CLIP model.""" - -import inspect -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - is_flaky, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - CLIPForImageClassification, - CLIPModel, - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPVisionModel, - CLIPVisionModelWithProjection, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import CLIPProcessor - - -class CLIPVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return CLIPVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = CLIPVisionModel(config=config) - model.eval() - with no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, pixel_values): - model = CLIPVisionModelWithProjection(config=config) - 
model.eval() - with no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class CLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (CLIPVisionModel, CLIPVisionModelWithProjection) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = CLIPVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="CLIPVisionModel has no 
base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPVisionModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "visual_projection")) - - -class CLIPTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return CLIPTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = CLIPTextModel(config=config) - model.eval() - with no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_projection(self, config, input_ids, input_mask): - model = CLIPTextModelWithProjection(config=config) - model.eval() - with no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim)) - - def 
prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - test_head_masking = False - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = CLIPTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_projection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="CLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="CLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @slow - def test_model_with_projection_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPTextModelWithProjection.from_pretrained(model_name) - self.assertIsNotNone(model) - self.assertTrue(hasattr(model, "text_projection")) - - -class CLIPModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = CLIPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = CLIPVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return CLIPConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), 
projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = CLIPModel(config).eval() - with no_grad(): - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class CLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": CLIPModel, "image-feature-extraction": CLIPVisionModel} if is_mindspore_available() else {} - ) - fx_compatible = True - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = CLIPModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="CLIPModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for CLIP - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save CLIPConfig and check if we can load CLIPVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = CLIPVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save CLIPConfig and check if we can load CLIPTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = CLIPTextConfig.from_pretrained(tmp_dir_name) - 
self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class CLIPForImageClassificationModelTester(CLIPModelTester): - def __init__(self, parent): - super().__init__(parent) - self.batch_size = self.vision_model_tester.batch_size - self.num_hidden_layers = self.vision_model_tester.num_hidden_layers - self.hidden_size = self.vision_model_tester.hidden_size - self.seq_length = self.vision_model_tester.seq_length - - def prepare_config_and_inputs(self): - _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class CLIPForImageClassificationModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPForImageClassification,) if is_mindspore_available() else () - pipeline_model_mapping = {"image-classification": CLIPForImageClassification} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = CLIPForImageClassificationModelTester(self) - - @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIP uses the same initialization scheme as the Flax original implementation") - def test_initialization(self): - pass - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class CLIPModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "openai/clip-vit-base-patch32" - model = CLIPModel.from_pretrained(model_name) - processor = CLIPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="ms" - ) - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - expected_logits = mindspore.tensor([[24.5701, 19.3049]]) - print(outputs.logits_per_image) - - 
self.assertTrue(ops.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) - - @slow - def test_inference_time(self): - import time - model_name = "openai/clip-vit-base-patch32" - model = CLIPModel.from_pretrained(model_name) - processor = CLIPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="ms" - ) - - infer_time = [] - # forward pass - with no_grad(): - for i in range(20): - s = time.time() - outputs = model(**inputs) - t = time.time() - infer_time.append(t - s) - - print(infer_time) diff --git a/tests/transformers/models/clip/test_tokenization_clip.py b/tests/transformers/models/clip/test_tokenization_clip.py deleted file mode 100644 index e8f6d099e..000000000 --- a/tests/transformers/models/clip/test_tokenization_clip.py +++ /dev/null @@ -1,183 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -import unittest - -from mindnlp.transformers import CLIPTokenizer, CLIPTokenizerFast -from mindnlp.transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES -from mindnlp.utils.testing_utils import require_ftfy, require_tokenizers - -from ...test_tokenization_common import TokenizerTesterMixin - - -@require_tokenizers -class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "openai/clip-vit-base-patch32" - tokenizer_class = CLIPTokenizer - rust_tokenizer_class = CLIPTokenizerFast - test_rust_tokenizer = True - from_pretrained_kwargs = {} - test_seq2seq = False - - def setUp(self): - super().setUp() - - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r"] - self.special_tokens_map = {"unk_token": ""} - - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - input_text = "lower newer" - output_text = "lower newer" - return input_text, output_text - - def test_full_tokenizer(self): - tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower newer" - bpe_tokens = ["lo", "w", "er", "n", "e", "w", 
"er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - @require_ftfy - def test_check_encoding_slow_fast(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d." - text_tokenized_s = tokenizer_s.tokenize(text) - text_tokenized_r = tokenizer_r.tokenize(text) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - # Test that the tokenization is identical on an example containing a character (Latin Small Letter A - # with Tilde) encoded in 2 different ways - text = "xa\u0303y" + " " + "x\xe3y" - text_tokenized_s = tokenizer_s.tokenize(text) - text_tokenized_r = tokenizer_r.tokenize(text) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - # Test that the tokenization is identical on unicode of space type - spaces_unicodes = [ - "\u0009", # (horizontal tab, '\t') - "\u000b", # (vertical tab) - "\u000c", # (form feed) - "\u0020", # (space, ' ') - "\u200e", # (left-to-right mark):w - "\u200f", # (right-to-left mark) - ] - for unicode_seq in spaces_unicodes: - text_tokenized_s = tokenizer_s.tokenize(unicode_seq) - text_tokenized_r = tokenizer_r.tokenize(unicode_seq) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - # Test that the tokenization is identical on unicode of line break type - line_break_unicodes = [ - "\u000a", # (line feed, '\n') - "\r\n", # (carriage return and line feed, '\r\n') - "\u000d", # (carriage return, '\r') - "\r", # (carriage return, '\r') - "\u000d", # (carriage return, '\r') - "\u2028", # (line separator) - "\u2029", # (paragraph separator) - # "\u0085", # (next line) - ] - - # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms - # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a - # space (and thus into an empty list). 
- - for unicode_seq in line_break_unicodes: - text_tokenized_s = tokenizer_s.tokenize(unicode_seq) - text_tokenized_r = tokenizer_r.tokenize(unicode_seq) - - self.assertListEqual(text_tokenized_s, text_tokenized_r) - - def test_offsets_mapping_with_different_add_prefix_space_argument(self): - # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` - text = f"{text_of_1_token} {text_of_1_token}" - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, - use_fast=True, - ) - encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) - self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) - self.assertEqual( - encoding.offset_mapping[1], - (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), - ) - - text = f" {text}" - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, - use_fast=True, - ) - encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) - self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token))) - self.assertEqual( - encoding.offset_mapping[1], - (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), - ) - - def test_log_warning(self): - # Test related to the breaking change introduced in transformers v4.17.0 - # We need to check that an error in raised when the user try to load a previous version of the tokenizer. - with self.assertRaises(ValueError) as context: - self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer") - - self.assertTrue( - context.exception.args[0].startswith( - "The `backend_tokenizer` provided does not match the expected format." - ) - ) - - @require_ftfy - def test_tokenization_python_rust_equals(self): - super().test_tokenization_python_rust_equals() - - @unittest.skip(reason="CLIP always lower cases letters") - def test_added_tokens_do_lower_case(self): - pass \ No newline at end of file diff --git a/tests/transformers/models/clipseg/__init__.py b/tests/transformers/models/clipseg/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/clipseg/test_modeling_clipseg.py b/tests/transformers/models/clipseg/test_modeling_clipseg.py deleted file mode 100644 index 0352dbe3a..000000000 --- a/tests/transformers/models/clipseg/test_modeling_clipseg.py +++ /dev/null @@ -1,627 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
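The deleted test_offsets_mapping_with_different_add_prefix_space_argument above rests on simple character arithmetic: for a text built from one known token repeated with a single space, the second token's span starts one character after the first one ends, and prepending a space shifts every span right by one. A small self-contained illustration of that arithmetic in plain Python (no tokenizer needed; "hello" stands in for the single-token text the test assumes):

token = "hello"
text = f"{token} {token}"
first_span = (0, len(token))
second_span = (len(token) + 1, len(token) + 1 + len(token))
assert text[first_span[0]:first_span[1]] == token
assert text[second_span[0]:second_span[1]] == token

# Prepending a space shifts every expected offset right by one, which is
# exactly the adjustment the test applies for the " {text}" variant.
shifted = f" {text}"
assert shifted[1:1 + len(token)] == token
assert shifted[1 + len(token) + 1:1 + 2 * len(token) + 1] == token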
-"""Testing suite for the mindspore CLIPSeg model.""" - -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - CLIPSegForImageSegmentation, - CLIPSegModel, - CLIPSegTextModel, - CLIPSegVisionModel, - CLIPSegTextConfig, - CLIPSegVisionConfig, - CLIPSegConfig, - CLIPSegProcessor - ) - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - - -if is_vision_available(): - from PIL import Image - -class CLIPSegVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return CLIPSegVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = CLIPSegVisionModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class CLIPSegVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we 
also overwrite some of the tests of test_modeling_common.py, as CLIPSeg does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (CLIPSegVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = CLIPSegVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=CLIPSegVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="CLIPSeg does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="SegFormer does not have get_input_embeddings method and get_output_embeddings methods") - def test_model_get_set_embeddings(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_model_outputs_equivalence(self): - pass - @unittest.skip(reason="CLIPSegVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="CLIPSegVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "CIDAS/clipseg-rd64-refined" - model = CLIPSegVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class CLIPSegTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - 
self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[int(batch_idx), :int(start_index)] = 1 - input_mask[int(batch_idx), int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return CLIPSegTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = CLIPSegTextModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPSegTextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = CLIPSegTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=CLIPSegTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="CLIPSeg does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - 
@unittest.skip(reason="CLIPSegTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="CLIPSegTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "CIDAS/clipseg-rd64-refined" - model = CLIPSegTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class CLIPSegModelTester: - def __init__( - self, - parent, - text_kwargs=None, - vision_kwargs=None, - is_training=True, - # This should respect the `num_hidden_layers` in `CLIPSegVisionModelTester` - extract_layers=(1,), - ): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = CLIPSegTextModelTester(parent, **text_kwargs) - self.vision_model_tester = CLIPSegVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - self.extract_layers = extract_layers - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return CLIPSegConfig.from_text_vision_configs( - self.text_model_tester.get_config(), - self.vision_model_tester.get_config(), - projection_dim=64, - reduce_dim=32, - extract_layers=self.extract_layers, - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = CLIPSegModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def create_and_check_model_for_image_segmentation(self, config, input_ids, attention_maks, pixel_values): - model = CLIPSegForImageSegmentation(config).set_train(False) - result = model(input_ids, pixel_values) - self.parent.assertEqual( - result.logits.shape, - ( - self.vision_model_tester.batch_size, - self.vision_model_tester.image_size, - self.vision_model_tester.image_size, - ), - ) - self.parent.assertEqual( - result.conditional_embeddings.shape, (self.text_model_tester.batch_size, config.projection_dim) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - } - return config, inputs_dict - - -@require_mindspore -class CLIPSegModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPSegModel, CLIPSegForImageSegmentation) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": CLIPSegModel} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - # 
CLIPSegForImageSegmentation requires special treatment - if return_labels: - if model_class.__name__ == "CLIPSegForImageSegmentation": - batch_size, _, height, width = inputs_dict["pixel_values"].shape - inputs_dict["labels"] = ops.zeros( - batch_size, height, width, dtype=mindspore.float32 - ) - - return inputs_dict - - def setUp(self): - self.model_tester = CLIPSegModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_for_image_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="CLIPSegModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - def test_model_outputs_equivalence(self): - pass - - @unittest.skip("SegFormer does not have get_input_embeddings method and get_output_embeddings methods") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # override as the some parameters require custom initialization - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if "logit_scale" in name: - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif "film" in name or "transposed_conv" in name or "reduce" in name: - # those parameters use Mindspore' default nn.Linear initialization scheme - pass - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save CLIPSegConfig and check if we can load CLIPSegVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = CLIPSegVisionConfig.from_pretrained(tmp_dir_name) - 
self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save CLIPSegConfig and check if we can load CLIPSegTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = CLIPSegTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - # overwrite from common since FlaxCLIPSegModel returns nested output - # which is not supported in the common test - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="Training test is skipped as the model was not trained") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class.__name__ in MODEL_MAPPING_NAMES.values(): - continue - - print("Model class:", model_class) - - model = model_class(config) - model.set_train(False) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - for k, v in inputs.items(): - print(k, v.shape) - loss = model(**inputs).loss - - @slow - def test_model_from_pretrained(self): - model_name = "CIDAS/clipseg-rd64-refined" - model = CLIPSegModel.from_pretrained(model_name, ignore_mismatched_sizes=True) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -@require_vision -@require_mindspore -class CLIPSegModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_image_segmentation(self): - model_name = "CIDAS/clipseg-rd64-refined" - processor = CLIPSegProcessor.from_pretrained(model_name, ignore_mismatched_sizes=True) - model = CLIPSegForImageSegmentation.from_pretrained(model_name,ignore_mismatched_sizes=True) - - image = prepare_img() - texts = ["a cat", "a remote", "a blanket"] - inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the predicted masks - self.assertEqual( - outputs.logits.shape, - (3, 352, 352), - ) - expected_masks_slice = mindspore.Tensor( - [[-7.4613, -7.4785, -7.3628], [-7.3268, -7.0899, -7.1333], [-6.9838, -6.7900, -6.8913]] - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3)) - - # verify conditional and pooled output - expected_conditional = mindspore.Tensor([0.5601, -0.0314, 0.1980]) - expected_pooled_output = mindspore.Tensor([0.5036, -0.2681, -0.2644]) - self.assertTrue(np.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3)) - self.assertTrue(np.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3)) diff --git a/tests/transformers/models/clvp/__init__.py b/tests/transformers/models/clvp/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/clvp/test_feature_extraction_clvp.py b/tests/transformers/models/clvp/test_feature_extraction_clvp.py deleted file mode 100644 index 0732c5c20..000000000 --- a/tests/transformers/models/clvp/test_feature_extraction_clvp.py +++ /dev/null @@ -1,238 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import itertools -import os -import random -import tempfile -import unittest - -import numpy as np -from datasets import Audio, load_dataset - -from mindnlp.transformers import ClvpFeatureExtractor -from mindnlp.utils.testing_utils import check_json_file_has_correct_format, require_mindspore, is_mindspore_available, slow -#from transformers.utils.import_utils import is_torch_available - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - -global_rng = random.Random() - - -# Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -@require_mindspore -class ClvpFeatureExtractionTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - min_seq_length=400, - max_seq_length=2000, - feature_size=10, - hop_length=160, - chunk_length=8, - padding_value=0.0, - sampling_rate=4_000, - return_attention_mask=False, - ): - self.parent = parent - self.batch_size = batch_size - self.min_seq_length = min_seq_length - self.max_seq_length = max_seq_length - self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) - self.padding_value = padding_value - self.sampling_rate = sampling_rate - self.return_attention_mask = return_attention_mask - self.feature_size = feature_size - self.chunk_length = chunk_length - self.hop_length = hop_length - - def prepare_feat_extract_dict(self): - return { - "feature_size": self.feature_size, - "hop_length": self.hop_length, - "chunk_length": self.chunk_length, - "padding_value": self.padding_value, - "sampling_rate": self.sampling_rate, - "return_attention_mask": self.return_attention_mask, - } - - # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common - def prepare_inputs_for_common(self, equal_length=False, numpify=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_length: - speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] - else: - # make sure that inputs increase in size - speech_inputs = [ - floats_list((x, self.feature_size)) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) - ] - if numpify: - speech_inputs = [np.asarray(x) for x in speech_inputs] - return speech_inputs - - -@require_mindspore -class ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = ClvpFeatureExtractor - - def setUp(self): - self.feat_extract_tester = ClvpFeatureExtractionTester(self) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - 
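-    # --- Editorial sketch (not part of the original test file): `prepare_inputs_for_common` above builds one
-    # --- speech input per batch entry with strictly increasing lengths (min_seq_length, min+diff, ...), using
-    # --- nested Python lists from `floats_list`. A NumPy-only approximation of the same pattern is shown below;
-    # --- the function name `increasing_length_inputs` is hypothetical.
-    # import numpy as np
-    #
-    # def increasing_length_inputs(batch_size=7, min_seq_length=400, max_seq_length=2000, feature_size=10):
-    #     step = (max_seq_length - min_seq_length) // (batch_size - 1)
-    #     # one (length, feature_size) array per sample, first dimensions 400, 666, ..., 1996
-    #     return [np.random.rand(length, feature_size).astype(np.float32)
-    #             for length in range(min_seq_length, max_seq_length, step)]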
- - # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - mel_1 = feat_extract_first.mel_filters - mel_2 = feat_extract_second.mel_filters - self.assertTrue(np.allclose(mel_1, mel_2)) - self.assertEqual(dict_first, dict_second) - - # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file - def test_feat_extract_to_json_file(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - mel_1 = feat_extract_first.mel_filters - mel_2 = feat_extract_second.mel_filters - self.assertTrue(np.allclose(mel_1, mel_2)) - self.assertEqual(dict_first, dict_second) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test feature size - input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features - self.assertTrue(input_features.ndim == 3) - self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size) - - # Test not batched input - encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. 
- speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test truncation required - speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs] - np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated] - - encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad - def test_double_precision_pad(self): - import mindspore - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="ms") - self.assertTrue(pt_processed.input_features.dtype == mindspore.float32) - - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - ds = ds.cast_column("audio", Audio(sampling_rate=22050)) - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples] - - @slow - def test_integration(self): - # fmt: off - EXPECTED_INPUT_FEATURES = mindspore.Tensor( - [ - 0.9271, 1.1405, 1.4419, 1.2470, 1.2438, 1.1787, 1.0595, 1.0570, 1.1070, - 1.2205, 1.2376, 1.2997, 1.1131, 1.0843, 1.0459, 1.1858, 1.2323, 1.3582, - 1.3401, 1.3770, 1.4173, 1.3381, 1.2291, 1.0854, 1.2116, 1.1873, 1.2178, - 1.2137, 1.3001, 1.4274 - ] - ) - # fmt: on - - input_speech, sr = self._load_datasamples(1) - - feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev") - input_features = feature_extractor(input_speech, sampling_rate=sr[0], return_tensors="ms").input_features - self.assertEqual(input_features.shape, (1, 80, 517)) - self.assertTrue(np.allclose(input_features[0, 0, :30].asnumpy(), EXPECTED_INPUT_FEATURES.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/clvp/test_modeling_clvp.py b/tests/transformers/models/clvp/test_modeling_clvp.py deleted file mode 100644 index 6b925b704..000000000 --- a/tests/transformers/models/clvp/test_modeling_clvp.py +++ /dev/null @@ -1,637 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Clvp model.""" - -import gc -import tempfile -import unittest - -import datasets -import numpy as np - -from mindnlp.transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, - is_mindspore_available, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - ids_tensor, - random_attention_mask, -) - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ClvpEncoder, ClvpForCausalLM, ClvpModel, ClvpModelForConditionalGeneration - -from mindnlp.transformers import ClvpFeatureExtractor, ClvpTokenizer - - -class ClvpEncoderTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=7, - is_training=False, - use_input_mask=True, - use_labels=True, - vocab_size=50, - hidden_size=128, - projection_dim=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=32, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - - def get_config(self): - encoder_config = ClvpEncoderConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - ) - - return encoder_config - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - encoder_config = self.get_config() - - return encoder_config, input_ids, input_mask - - def 
prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - speech_config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return speech_config, inputs_dict - - def create_and_check_model(self, speech_config, input_ids, input_mask): - text_config = ClvpEncoderConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - text_encoder_model = ClvpEncoder(config=text_config) - #text_encoder_model.to(torch_device) - text_encoder_model.set_train(False) - #with mindspore._no_grad(): - result = text_encoder_model(input_ids, attention_mask=input_mask) - result = text_encoder_model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim)) - - # now check with speech config - speech_encoder_model = ClvpEncoder(config=speech_config) - #speech_encoder_model.to(torch_device) - speech_encoder_model.set_train(False) - #with mindspore._no_grad(): - result = speech_encoder_model(input_ids, attention_mask=input_mask) - result = speech_encoder_model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result[0].shape, (self.batch_size, self.projection_dim)) - - -@require_mindspore -class ClvpEncoderTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = [ClvpEncoder,] if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - test_torchscript = False - - def setUp(self): - self.model_tester = ClvpEncoderTester(self) - self.encoder_config_tester = ConfigTester(self, config_class=ClvpEncoderConfig, hidden_size=32) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - - def test_config(self): - self.encoder_config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="ClvpEncoder does not output loss") - def test_training(self): - pass - - @unittest.skip(reason="ClvpEncoder does not output loss") - def test_training_gradient_checkpointing(self): - pass - - -class ClvpDecoderTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=3, - is_training=False, - vocab_size=300, - max_position_embeddings=256, - max_text_tokens=256, - use_input_mask=True, - hidden_size=128, - num_hidden_layers=2, - num_attention_heads=2, - bos_token_id=97, - eos_token_id=98, - relative_attention_num_buckets=4, - relative_attention_max_distance=16, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.max_text_tokens = max_text_tokens - self.use_input_mask = use_input_mask - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - self.bos_token_id = bos_token_id - self.eos_token_id = 
eos_token_id - self.relative_attention_num_buckets = relative_attention_num_buckets - self.relative_attention_max_distance = relative_attention_max_distance - - def get_config(self): - decoder_config = ClvpDecoderConfig( - vocab_size=self.vocab_size, - max_position_embeddings=self.max_position_embeddings, - max_text_tokens=self.max_text_tokens, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - relative_attention_num_buckets=self.relative_attention_num_buckets, - relative_attention_max_distance=self.relative_attention_max_distance, - ) - - return decoder_config - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((int(start_index),), batch_idx), ops.arange(mindspore.Tensor(start_index))], dim=1), - ops.full((int(start_index),), 1)) - ops.scatter_nd_update(input_mask, - ops.stack([ops.full((input_mask.shape[1] - int(start_index),), batch_idx), ops.arange(mindspore.Tensor(input_mask.shape[1] - start_index))], dim=1), - ops.full((input_mask.shape[1] - int(start_index),), 0)) - - decoder_config = self.get_config() - - return decoder_config, input_ids, input_mask - - def create_and_check_model(self, config, input_ids, attention_mask): - model = ClvpForCausalLM(config).set_train(False) - #with mindspore._no_grad(): - result = model(input_ids=input_ids, attention_mask=attention_mask) - - self.parent.assertEqual(result[0].shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = [ClvpForCausalLM] if is_mindspore_available() else () - all_generative_model_classes = (ClvpForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_mindspore_available() else {} - - test_pruning = False - - def setUp(self): - self.model_tester = ClvpDecoderTester(self) - self.decoder_config_tester = ConfigTester(self, config_class=ClvpDecoderConfig, hidden_size=32) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - if return_labels and model_class == ClvpForCausalLM: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length) - ).long() - - return inputs_dict - - def test_training(self): - # we will only test the ClvpForCausalLM since it outputs loss - config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - model = ClvpForCausalLM(config) - #model.to(torch_device) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True) - loss = model(**inputs).loss - #loss.backward() - - def test_training_gradient_checkpointing(self): - # we will only test the ClvpForCausalLM since it outputs loss - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - - model = ClvpForCausalLM(config) - #model.to(torch_device) - #model.gradient_checkpointing_enable() - model.set_train() - inputs = self._prepare_for_class(inputs_dict, ClvpForCausalLM, return_labels=True) - - loss = model(**inputs).loss - #loss.backward() - - -class ClvpModelForConditionalGenerationTester: - def __init__(self, parent, is_training=False): - self.parent = parent - self.clvp_encoder_tester = ClvpEncoderTester(parent) - self.is_training = is_training - self.batch_size = self.clvp_encoder_tester.batch_size # need bs for batching_equivalence test - - def get_config(self): - decoder_config = ClvpDecoderConfig( - vocab_size=50, - max_position_embeddings=30, - max_text_tokens=30, - hidden_size=128, - num_hidden_layers=1, - num_attention_heads=2, - bos_token_id=97, - eos_token_id=98, - relative_attention_num_buckets=4, - relative_attention_max_distance=16, - ) - text_config = self.clvp_encoder_tester.get_config() - speech_config = self.clvp_encoder_tester.get_config() - speech_config.vocab_size = 300 - - return ClvpConfig.from_sub_model_configs( - text_config, - speech_config, - decoder_config, - projection_dim=16, - ) - - def prepare_config_and_inputs(self): - _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs() - - ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) - _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() - - feature_extractor = ClvpFeatureExtractor() - input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="ms")[ - "input_features" - ] - - config = self.get_config() - - return config, input_ids, attention_mask, input_features - - def create_and_check_model(self, config, input_ids, attention_mask, input_features): - model = ClvpModelForConditionalGeneration(config).set_train(False) - #with mindspore._no_grad(): - result = model(input_ids=input_ids, input_features=input_features, attention_mask=attention_mask) - - self.parent.assertEqual(result.logits_per_speech.shape, (2, self.clvp_encoder_tester.batch_size)) - self.parent.assertEqual(result.logits_per_text.shape, (self.clvp_encoder_tester.batch_size, 2)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, input_features = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "input_features": input_features, - "return_loss": False, - } - return config, inputs_dict - - -@require_mindspore -class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = [ClvpModelForConditionalGeneration,] if is_mindspore_available() else () - - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - 
self.model_tester = ClvpModelForConditionalGenerationTester(self) - self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - #model.to(torch_device) - model.set_train(False) - - #with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # check for decoder model, text encoder model and speech encoder model hidden states - decoder_hidden_states = outputs.decoder_hidden_states - text_encoder_hidden_states = outputs.text_encoder_hidden_states - speech_encoder_hidden_states = outputs.speech_encoder_hidden_states - - # check length of the hidden states - expected_decoder_num_layers = config.decoder_config.num_hidden_layers + 1 - self.assertEqual(len(decoder_hidden_states), expected_decoder_num_layers) - - expected_speech_encoder_num_layers = config.text_config.num_hidden_layers + 1 - self.assertEqual(len(text_encoder_hidden_states), expected_speech_encoder_num_layers) - - expected_text_encoder_num_layers = config.speech_config.num_hidden_layers + 1 - self.assertEqual(len(speech_encoder_hidden_states), expected_text_encoder_num_layers) - - # check shapes of each hidden state - - # for the decoder model we will only test the dimension because the ClvpConditioningEncoder could increase - # the sequence lengths. - self.assertEqual(decoder_hidden_states[0].shape[-1], config.decoder_config.hidden_size) - - # the testing for text encoder stays standard because we just pass the text tokens here. - self.assertListEqual( - list(text_encoder_hidden_states[0].shape[-2:]), - [self.model_tester.clvp_encoder_tester.seq_length, config.text_config.hidden_size], - ) - - # for the decoder model we will only test the dimension because the fix_decoder_outputs method could increase - # the sequence lengths by adding `decoder_fixing_codes` tokens at the end. 
- self.assertEqual(speech_encoder_hidden_states[0].shape[-1], config.speech_config.hidden_size) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="ClvpModelForConditionalGeneration does not have get_input_embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for Clvp - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - expected_value = np.log(1 / 0.07) - returned_value = param.data.item() - - self.assertAlmostEqual( - returned_value, - expected_value, - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - expected_range = [0.0, 1.0] - returned_range = ((param.data.mean() * 1e9).round() / 1e9).item() - - self.assertIn( - returned_range, - expected_range, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_speech_text_decoder_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save ClvpConfig and check if we can load ClvpEncoderConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - encoder_config = ClvpEncoderConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), encoder_config.to_dict()) - - # Save ClvpConfig and check if we can load ClvpDecoderConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - decoder_config = ClvpDecoderConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.decoder_config.to_dict(), decoder_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "susnato/clvp_dev" - model = ClvpModelForConditionalGeneration.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# Since Clvp has a lot of different models connected with each other it's better to test each of them individually along -# with a test_full_model_integration. If the model breaks in future, it could be of a great help to identify the broken part. - - -@slow -@require_mindspore -class ClvpIntegrationTest(unittest.TestCase): - def setUp(self): - self.text = "This is an example text." 
- ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) - _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() - - self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev") - self.model.set_train(False) - tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev") - feature_extractor = ClvpFeatureExtractor.from_pretrained("susnato/clvp_dev") - - tokenizer_output = tokenizer(self.text, return_tensors="ms") - self.text_tokens = tokenizer_output["input_ids"] - self.input_features = feature_extractor( - raw_speech=self.speech_samples, sampling_rate=self.sr, return_tensors="ms" - )["input_features"] - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - - def test_conditional_encoder(self): - #with mindspore._no_grad(): - conditioning_encoder_outputs = self.model.conditioning_encoder( - input_features=self.input_features, input_ids=self.text_tokens - ) - - self.assertEqual( - conditioning_encoder_outputs.shape, - (self.input_features.shape[0], 18, self.model.config.decoder_config.hidden_size), - ) - - EXPECTED_OUTPUTS = mindspore.Tensor( - [[-0.8582, 0.5228, 1.9944], [-0.0465, -1.1017, -0.0093], [-0.0466, -0.6030, -0.1280]] - ) - - self.assertTrue(np.allclose(conditioning_encoder_outputs[0, :3, :3].asnumpy(), EXPECTED_OUTPUTS.asnumpy(), atol=1e-4)) - - def test_decoder_model_generate(self): - autoregressive_model_output = self.model.speech_decoder_model.generate(input_ids=self.text_tokens) - - EXPECTED_OUTPUTS = mindspore.Tensor([[147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 9, 8193]]) - - self.assertTrue(np.allclose(autoregressive_model_output.asnumpy(), EXPECTED_OUTPUTS.asnumpy())) - - def test_text_and_speech_encoder_models(self): - # check for text embeds - text_embeds = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)[0] - - # fmt: off - EXPECTED_TEXT_EMBEDS = mindspore.Tensor([1.4798, -2.0005, 2.3902, -0.5042, 1.6401, -2.4135, -1.4800, 3.0118, -2.4422, 1.3266, 2.2339, 1.4761, -4.8983, -1.3592, 6.0251, 6.7364, 2.2576, 3.7229, -10.0436, 4.6676]) - # fmt: on - - self.assertTrue(np.allclose(text_embeds[0, :20].asnumpy(), EXPECTED_TEXT_EMBEDS.asnumpy(), atol=1e-4)) - - # check for speech embeds - speech_embeds = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)[0] - - # fmt: off - EXPECTED_SPEECH_EMBEDS = mindspore.Tensor([3.1202, -3.1183, -1.4264, -6.1339, 1.8885, -0.1983, 0.9461, -1.7414, 0.3320, -3.8400, -1.5715, 1.5096, -1.7576, 0.2387, 4.9758, 5.8450, -6.2534, 2.8587, -5.5816, 4.7821]) - # fmt: on - - self.assertTrue(np.allclose(speech_embeds[0, :20].asnumpy(), EXPECTED_SPEECH_EMBEDS.asnumpy(), atol=1e-4)) - - def test_full_model_integration(self): - full_model_output = self.model.generate( - input_ids=self.text_tokens, - input_features=self.input_features, - do_sample=False, - num_beams=4, - num_return_sequences=4, - max_new_tokens=10, - ) - - EXPECTED_SPEECH_IDS = mindspore.Tensor([[1953, 1080, 612], [1953, 612, 493], [1953, 612, 716]]) - EXPECTED_SIMILARITY_SCORES = mindspore.Tensor([[14.7660, 14.4569, 13.6472, 13.5683]]) - - self.assertTrue(np.allclose(full_model_output.speech_ids[-3:, -3:].asnumpy(), EXPECTED_SPEECH_IDS.asnumpy())) - self.assertTrue(np.allclose(full_model_output.logits_per_text.asnumpy(), EXPECTED_SIMILARITY_SCORES.asnumpy())) diff --git 
a/tests/transformers/models/clvp/test_processor_clvp.py b/tests/transformers/models/clvp/test_processor_clvp.py deleted file mode 100644 index 72adac518..000000000 --- a/tests/transformers/models/clvp/test_processor_clvp.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import gc -import shutil -import tempfile -import unittest - -from mindnlp.transformers import ClvpFeatureExtractor, ClvpProcessor, ClvpTokenizer -from mindnlp.utils.testing_utils import require_mindspore, is_mindspore_available - -from .test_feature_extraction_clvp import floats_list - - -@require_mindspore -class ClvpProcessorTest(unittest.TestCase): - def setUp(self): - self.checkpoint = "susnato/clvp_dev" - self.tmpdirname = tempfile.mkdtemp() - - def tearDown(self): - super().tearDown() - shutil.rmtree(self.tmpdirname) - gc.collect() - - # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_tokenizer with Whisper->Clvp - def get_tokenizer(self, **kwargs): - return ClvpTokenizer.from_pretrained(self.checkpoint, **kwargs) - - # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.get_feature_extractor with Whisper->Clvp - def get_feature_extractor(self, **kwargs): - return ClvpFeatureExtractor.from_pretrained(self.checkpoint, **kwargs) - - # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_save_load_pretrained_default with Whisper->Clvp - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - - processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - processor.save_pretrained(self.tmpdirname) - processor = ClvpProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, ClvpTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor) - - # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_feature_extractor with Whisper->Clvp,processor(raw_speech->processor(raw_speech=raw_speech - def test_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(raw_speech=raw_speech, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer with 
Whisper->Clvp - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - # Copied from transformers.tests.models.whisper.test_processor_whisper.WhisperProcessorTest.test_tokenizer_decode with Whisper->Clvp - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_save_load_pretrained_additional_features(self): - processor = ClvpProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(pad_token="(PAD)") - feature_extractor_add_kwargs = self.get_feature_extractor(sampling_rate=16000) - - processor = ClvpProcessor.from_pretrained( - self.tmpdirname, - pad_token="(PAD)", - sampling_rate=16000, - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, ClvpTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, ClvpFeatureExtractor) - - def test_model_input_names(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = ClvpProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - self.assertListEqual( - sorted(processor.model_input_names), - sorted(set(feature_extractor.model_input_names + tokenizer.model_input_names)), - msg="`processor` and `feature_extractor` model input names do not match", - ) diff --git a/tests/transformers/models/clvp/test_tokenization_clvp.py b/tests/transformers/models/clvp/test_tokenization_clvp.py deleted file mode 100644 index 16138d9d8..000000000 --- a/tests/transformers/models/clvp/test_tokenization_clvp.py +++ /dev/null @@ -1,317 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import json -import os -import unittest -from typing import List - -from mindnlp.transformers import ClvpTokenizer - -from ...test_tokenization_common import TokenizerTesterMixin, slow - - -class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "susnato/clvp_dev" - tokenizer_class = ClvpTokenizer - test_rust_tokenizer = False - from_pretrained_kwargs = {"add_prefix_space": True} - test_seq2seq = False - test_sentencepiece_ignore_case = True - - def setUp(self): - super().setUp() - - # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = [ - "l", - "o", - "w", - "e", - "r", - "s", - "t", - "i", - "d", - "n", - "\u0120", - "\u0120l", - "\u0120n", - "\u0120lo", - "\u0120low", - "er", - "\u0120lowest", - "\u0120newer", - "\u0120wider", - "", - "<|endoftext|>", - "[SPACE]", - ] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - self.vocab_file = os.path.join(self.tmpdirname, "vocab.json") - self.merges_file = os.path.join(self.tmpdirname, "merges.txt") - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return ClvpTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts - def get_input_output_texts(self, tokenizer): - input_text = "lower newer" - output_text = "lower newer" - return input_text, output_text - - # Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens - def test_add_special_tokens(self): - tokenizers: List[ClvpTokenizer] = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" - special_token_box = [1000, 1000, 1000, 1000] - - tokenizer.add_special_tokens({"cls_token": special_token}) - encoded_special_token = tokenizer.encode( - [special_token], boxes=[special_token_box], add_special_tokens=False - ) - self.assertEqual(len(encoded_special_token), 1) - - decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) - self.assertTrue(special_token not in decoded) - - # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) - - sequence = "lower newer" - - # Testing tokenization - tokens = tokenizer.tokenize(sequence, add_prefix_space=True) - rust_tokens = rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - # Testing conversion to ids without special tokens - ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - # Testing conversion to ids with special tokens - rust_tokenizer = 
self.get_rust_tokenizer(add_prefix_space=True) - ids = tokenizer.encode(sequence, add_prefix_space=True) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - # Testing the unknown token - input_tokens = tokens + [rust_tokenizer.unk_token] - input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] - self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding - def test_padding(self, max_length=15): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Simple input - s = "This is a simple input" - s2 = ["This is a simple input 1", "This is a simple input 2"] - p = ("This is a simple input", "This is a pair") - p2 = [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ] - - # Simple input tests - self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") - - # Simple input - self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") - - # Simple input - self.assertRaises( - ValueError, - tokenizer_r.batch_encode_plus, - s2, - max_length=max_length, - padding="max_length", - ) - - # Pair input - self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") - - # Pair input - self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") - - # Pair input - self.assertRaises( - ValueError, - tokenizer_r.batch_encode_plus, - p2, - max_length=max_length, - padding="max_length", - ) - - # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding_if_pad_token_set_slow - def test_padding_if_pad_token_set_slow(self): - tokenizer = ClvpTokenizer.from_pretrained(self.tmpdirname, pad_token="") - - # Simple input - s = "This is a simple input" - s2 = ["This is a simple input looooooooong", "This is a simple input"] - p = ("This is a simple input", "This is a pair") - p2 = [ - ("This is a simple input loooooong", "This is a simple input"), - ("This is a simple pair loooooong", "This is a simple pair"), - ] - - pad_token_id = tokenizer.pad_token_id - - out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np") - out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np") - out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np") - out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np") - - # s - # test single string max_length padding - self.assertEqual(out_s["input_ids"].shape[-1], 30) - self.assertTrue(pad_token_id in out_s["input_ids"]) - self.assertTrue(0 in out_s["attention_mask"]) - - # s2 - # test automatic padding - self.assertEqual(out_s2["input_ids"].shape[-1], 33) - # long slice doesn't have padding - self.assertFalse(pad_token_id in out_s2["input_ids"][0]) - self.assertFalse(0 in out_s2["attention_mask"][0]) - # short slice does have padding - self.assertTrue(pad_token_id in out_s2["input_ids"][1]) - self.assertTrue(0 in out_s2["attention_mask"][1]) - - # p - # test single pair max_length padding - self.assertEqual(out_p["input_ids"].shape[-1], 60) - self.assertTrue(pad_token_id in out_p["input_ids"]) - self.assertTrue(0 in 
out_p["attention_mask"]) - - # p2 - # test automatic padding pair - self.assertEqual(out_p2["input_ids"].shape[-1], 52) - # long slice pair doesn't have padding - self.assertFalse(pad_token_id in out_p2["input_ids"][0]) - self.assertFalse(0 in out_p2["attention_mask"][0]) - # short slice pair does have padding - self.assertTrue(pad_token_id in out_p2["input_ids"][1]) - self.assertTrue(0 in out_p2["attention_mask"][1]) - - # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_special_tokens_mask_input_pairs_and_bos_token - def test_special_tokens_mask_input_pairs_and_bos_token(self): - # TODO: change to self.get_tokenizers() when the fast version is implemented - tokenizers = [self.get_tokenizer(do_lower_case=False, add_bos_token=True)] - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence_0 = "Encode this." - sequence_1 = "This one too please." - encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) - encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - sequence_0, - sequence_1, - add_special_tokens=True, - return_special_tokens_mask=True, - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_token_type_ids(self): - tokenizer = self.get_tokenizer() - seq_0 = "Test this method." - - # We want to have sequence 0 and sequence 1 are tagged - # respectively with 0 and 1 token_ids - # (regardless of whether the model use token type ids) - # We use this assumption in the QA pipeline among other place - output = tokenizer(seq_0, return_token_type_ids=True, add_special_tokens=True) - self.assertIn(0, output["token_type_ids"]) - - def test_full_tokenizer(self): - tokenizer = ClvpTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "lower newer" - bpe_tokens = ["l", "o", "w", "er", "[SPACE]", "n", "e", "w", "er"] - tokens = tokenizer.tokenize(text, add_prefix_space=False) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [0, 1, 2, 15, 21, 9, 3, 2, 15, 19] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - @slow - def test_outputs_with_numbers(self): - text = "hello and this is an example text and I have $1000. my lucky number is 12345." 
- tokenizer = ClvpTokenizer.from_pretrained("susnato/clvp_dev") - - # fmt: off - EXPECTED_OUTPUT = [62, 84, 28, 2, 53, 2,147, 2, 54, 2, 43, 2, 169, 122, 29, 64, 2, 136, 37, 33, 2, 53, 2, 22, - 2, 148, 2, 110, 2, 40, 206, 53, 2, 134, 84, 59, 32, 9, 2, 125, 2, 25, 34, 197, 38, 2, 27, - 231, 15, 44, 2, 54, 2, 33, 100, 25, 76, 2, 40, 206, 53, 7, 2, 40, 46, 18, 2, 21, 97, 17, - 219, 2, 87, 210, 8, 19, 22, 76, 9, - ] - # fmt: on - - self.assertListEqual(tokenizer.encode(text, add_special_tokens=False), EXPECTED_OUTPUT) - - @slow - def test_tokenizer_integration(self): - sequences = [ - "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " - "general-purpose architectures (BERT, RoBERTa, XLM, DistilBert, XLNet...) for Natural " - "Language Understanding (NLU) and Natural Language Generation (NLG) with over multiple pretrained " - "models and deep interoperability between Jax, PyTorch and TensorFlow.", - "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " - "conditioning on both left and right context in all layers.", - "The quick brown fox jumps over the lazy dog.", - ] - - # fmt: off - expected_encoding = {'input_ids': [[144, 43, 32, 87, 26, 173, 2, 5, 87, 26, 44, 70, 2, 209, 27, 2, 55, 2, 29, 38, 51, 31, 71, 8, 144, 43, 32, 87, 26, 173, 2, 53, 2, 29, 38, 51, 31, 71, 8, 29, 46, 144, 137, 49, 8, 15, 44, 33, 6, 2, 187, 35, 83, 61, 2, 20, 50, 44, 56, 8, 29, 121, 139, 66, 2, 59, 71, 60, 18, 16, 33, 34, 175, 2, 5, 15, 44, 33, 7, 2, 89, 15, 44, 33, 14, 7, 2, 37, 25, 26, 7, 2, 17, 54, 78, 25, 15, 44, 33, 7, 2, 37, 25, 111, 33, 9, 9, 9, 6, 2, 87, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 97, 234, 63, 53, 52, 2, 5, 27, 25, 34, 6, 2, 53, 2, 27, 48, 121, 56, 2, 25, 43, 20, 34, 14, 112, 2, 20, 50, 44, 158, 2, 5, 27, 25, 20, 6, 2, 103, 2, 253, 2, 26, 167, 78, 29, 64, 2, 29, 46, 144, 137, 49, 2, 115, 126, 25, 32, 2, 53, 2, 126, 18, 29, 2, 41, 114, 161, 44, 109, 151, 240, 2, 67, 33, 100, 50, 2, 23, 14, 37, 7, 2, 29, 38, 51, 31, 71, 2, 53, 2, 33, 50, 32, 57, 19, 25, 69, 9], [ 15, 44, 33, 2, 54, 2, 17, 61, 22, 20, 27, 49, 2, 51, 2, 29, 46, 8, 144, 137, 2, 126, 18, 29, 2, 15, 83, 22, 46, 16, 181, 56, 2, 46, 29, 175, 86, 158, 32, 2, 154, 2, 97, 25, 14, 67, 25, 49, 2, 136, 37, 33, 2, 185, 2, 23, 28, 41, 33, 70, 2, 135, 17, 60, 107, 52, 2, 47, 2, 165, 40, 2, 64, 19, 33, 2, 53, 2, 101, 104, 2, 135, 136, 37, 33, 2, 41, 2, 108, 2, 25, 88, 173, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 42, 2, 194, 91, 24, 2, 243, 190, 2, 182, 37, 2, 23, 231, 29, 32, 2, 253, 2, 42, 2, 25, 14, 39, 38, 2, 134, 20, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501 - 
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], # noqa: E501 - } - # fmt: on - - self.tokenizer_integration_test_util( - sequences=sequences, expected_encoding=expected_encoding, model_name="susnato/clvp_dev", padding=True - ) - - @unittest.skip(reason="AssertionError: 0 not greater than or equal to 1") - def test_pretrained_model_lists(self): - pass diff --git a/tests/transformers/models/codegen/__init__.py b/tests/transformers/models/codegen/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/codegen/test_modeling_codegen.py b/tests/transformers/models/codegen/test_modeling_codegen.py deleted file mode 100644 index 98f173a24..000000000 --- a/tests/transformers/models/codegen/test_modeling_codegen.py +++ /dev/null @@ -1,564 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import datetime -import unittest - -import numpy as np - -from mindnlp.transformers import CodeGenConfig -from mindnlp.utils import cached_property, is_mindspore_available -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST, - AutoTokenizer, - CodeGenForCausalLM, - CodeGenModel - ) - -class CodeGenModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=256, - hidden_size=32, - rotary_dim=4, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.rotary_dim = rotary_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return CodeGenConfig.from_pretrained("Salesforce/codegen-2B-mono") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - 
token_labels, - choice_labels, - ) - - def get_config(self): - return CodeGenConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - rotary_dim=self.rotary_dim, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_codegen_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = CodeGenModel(config=config) - - model.set_train(False) - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_codegen_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = CodeGenModel(config=config) - - model.set_train(False) - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], axis=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_codegen_model_attention_mask_past( - self, config, input_ids, 
input_mask, head_mask, token_type_ids, *args - ): - model = CodeGenModel(config=config) - - model.set_train(False) - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - axis=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_codegen_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = CodeGenModel(config=config) - - model.set_train(False) - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = CodeGenForCausalLM(config) - - model.set_train(False) - - result = model(input_ids, 
token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = CodeGenForCausalLM(config) - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} - - return config, inputs_dict - - -@require_mindspore -class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (CodeGenModel, CodeGenForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (CodeGenForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": CodeGenModel, "text-generation": CodeGenForCausalLM} if is_mindspore_available() else {} - ) - fx_compatible = False - test_pruning = False - test_missing_keys = False - test_model_parallel = False - test_head_masking = False - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - return inputs_dict - - def setUp(self): - self.model_tester = CodeGenModelTester(self) - self.config_tester = ConfigTester(self, config_class=CodeGenConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_codegen_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model(*config_and_inputs) - - def test_codegen_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model_past(*config_and_inputs) - - def test_codegen_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model_attention_mask_past(*config_and_inputs) - - def test_codegen_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_codegen_model_past_large_inputs(*config_and_inputs) - - def test_codegen_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_codegen_gradient_checkpointing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - @slow - def test_batch_generation(self): - tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") - model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") - - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token 
- model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = ["def hellow_world():", "def greet(name):"] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - token_type_ids = ops.cat( - [ - input_ids.new_zeros((input_ids.shape[0], input_ids.shape[1] - 1)), - input_ids.new_ones((input_ids.shape[0], 1)) * 500, - ], - axis=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - token_type_ids=token_type_ids, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - 'def hellow_world():\n print("Hello World")\n\nhellow_world()', - 'def greet(name):\n print(f"Hello {name}")\n\ng', - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - for model_name in CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = CodeGenModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class CodeGenModelLanguageGenerationTest(unittest.TestCase): - @cached_property - def cached_tokenizer(self): - return AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono") - - @cached_property - def cached_model(self): - return CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono") - - @slow - def test_lm_generate_codegen(self): - tokenizer = self.cached_tokenizer - for checkpointing in [True, False]: - model = self.cached_model - - # if checkpointing: - # model.gradient_checkpointing_enable() - # else: - # model.gradient_checkpointing_disable() - - - inputs = tokenizer("def hello_world():", return_tensors="ms") - expected_output = 'def hello_world():\n print("Hello World")\n\nhello_world()\n\n' - - output_ids = model.generate(**inputs, do_sample=False) - output_str = tokenizer.batch_decode(output_ids)[0] - - self.assertEqual(output_str, expected_output) - - @slow - def test_codegen_sample(self): - tokenizer = self.cached_tokenizer - model = self.cached_model - - - mindspore.set_seed(1234) - - tokenized = tokenizer("def hello_world():", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - output_ids = model.generate(input_ids, do_sample=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - token_type_ids = tokenized.token_type_ids - output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) - output_seq_tt = model.generate( - 
input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 - ) - output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) - output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) - EXPECTED_OUTPUT_STR = 'def hello_world():\n print("Hello World")\n return True\n\nresult =' - - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) - self.assertTrue( - all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) - ) # token_type_ids should change output - - @is_flaky(max_attempts=3, description="measure of timing is somehow flaky.") - @slow - def test_codegen_sample_max_time(self): - tokenizer = self.cached_tokenizer - model = self.cached_model - - - mindspore.set_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - - MAX_TIME = 0.05 - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=2 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=2 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=2 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=2 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=None, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=2 * MAX_TIME)) \ No newline at end of file diff --git a/tests/transformers/models/cohere/__init__.py b/tests/transformers/models/cohere/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/cohere/test_modeling_cohere.py b/tests/transformers/models/cohere/test_modeling_cohere.py deleted file mode 100644 index dcf3c6a27..000000000 --- a/tests/transformers/models/cohere/test_modeling_cohere.py +++ /dev/null @@ -1,326 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -import numpy as np - -from mindnlp.transformers import CohereConfig -from mindnlp.utils import cached_property, is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import AutoTokenizer, CohereForCausalLM, CohereModel - -# Copied from transformers.tests.models.llama.LlamaModelTester with Llama->Cohere -class CohereModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - # Ignore copy - def get_config(self): - return CohereConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - 
initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - eos_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = CohereModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = CohereModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = CohereForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = CohereForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are 
equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (CohereModel, CohereForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (CohereForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": CohereModel, - "text-generation": CohereForCausalLM, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - fx_compatible = True - - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` - # This is because we are hitting edge cases with the causal_mask buffer - model_split_percents = [0.5, 0.7, 0.8] - - def setUp(self): - self.model_tester = CohereModelTester(self) - self.config_tester = ConfigTester(self, config_class=CohereConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - -@require_mindspore -@slow -class CohereIntegrationTest(unittest.TestCase): - def test_batched_small_model_logits(self): - # Since the model is very large, we created a random cohere model so that we can do a simple - # logits check on it. - model_id = "hf-internal-testing/cohere-random" - - EXPECTED_LOGITS = mindspore.Tensor( - [ - [[0.0000, 0.1866, -0.1997], [0.0000, -0.0736, 0.1785], [0.0000, -0.1965, -0.0569]], - [[0.0000, -0.0302, 0.1488], [0.0000, -0.0402, 0.1351], [0.0000, -0.0341, 0.1116]], - ] - ) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = CohereForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16) - - tokenizer.pad_token = tokenizer.eos_token - - text = ["Hello today I am going to show you how to", "Hi there, here we are"] - inputs = tokenizer(text, return_tensors="ms", padding=True) - output = model(**inputs) - - logits = output.logits - self.assertTrue(np.allclose(EXPECTED_LOGITS.asnumpy(), logits[:, :3, :3].asnumpy(), rtol=1e-3, atol=1e-3)) diff --git a/tests/transformers/models/conditional_detr/__init__.py b/tests/transformers/models/conditional_detr/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/transformers/models/conditional_detr/test_image_processing_conditional_detr.py deleted file mode 100644 index b5c6a8483..000000000 --- a/tests/transformers/models/conditional_detr/test_image_processing_conditional_detr.py +++ /dev/null @@ -1,595 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import pathlib -import unittest - -import numpy as np -from mindspore import ops -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow, get_tests_dir -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ConditionalDetrImageProcessor - - -class ConditionalDetrImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_rescale=True, - rescale_factor=1 / 255, - do_pad=True, - ): - # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p - size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_pad = do_pad - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_pad": self.do_pad, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to ConditionalDetrImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None - fixtures_path = pathlib.Path(get_tests_dir()) / 'fixtures/tests_samples/COCO' - - def setUp(self): - super().setUp() - self.image_processor_tester = ConditionalDetrImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) - self.assertEqual(image_processor.do_pad, True) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) - - @slow - def test_call_pytorch_with_coco_detection_annotations(self): - # prepare image and target - image = Image.open(self.fixtures_path / "000000039769.png") - with open(self.fixtures_path / "coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"image_id": 39769, "annotations": target} - - # encode them - image_processing = ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50") - encoding = image_processing(images=image, annotations=target, return_tensors="ms") - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, 
expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(np.allclose(encoding["pixel_values"][0, 0, 0, :3].numpy(), expected_slice.numpy(), atol=1e-4)) - - # verify area - expected_area = mindspore.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - self.assertTrue(np.allclose(encoding["labels"][0]["area"].numpy(), expected_area.numpy())) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"][0].numpy(), expected_boxes_slice.numpy(), atol=1e-3)) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue(np.allclose(encoding["labels"][0]["image_id"].numpy(), expected_image_id.numpy())) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(np.allclose(encoding["labels"][0]["iscrowd"].numpy(), expected_is_crowd.numpy())) - # verify class_labels - expected_class_labels = mindspore.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue(np.allclose(encoding["labels"][0]["class_labels"].numpy(), expected_class_labels.numpy())) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue(np.allclose(encoding["labels"][0]["orig_size"].numpy(), expected_orig_size.numpy())) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue(np.allclose(encoding["labels"][0]["size"].numpy(), expected_size.numpy())) - - @slow - def test_call_pytorch_with_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image = Image.open(self.fixtures_path / "000000039769.png") - with open(self.fixtures_path / "coco_panoptic_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - - masks_path = pathlib.Path(self.fixtures_path / "coco_panoptic") - - # encode them - image_processing = ConditionalDetrImageProcessor(format="coco_panoptic") - encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="ms") - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(np.allclose(encoding["pixel_values"][0, 0, 0, :3].numpy(), expected_slice.numpy(), atol=1e-4)) - - # verify area - expected_area = mindspore.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - self.assertTrue(np.allclose(encoding["labels"][0]["area"].numpy(), expected_area.numpy())) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"][0].numpy(), expected_boxes_slice.numpy(), atol=1e-3)) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue(np.allclose(encoding["labels"][0]["image_id"].numpy(), expected_image_id.numpy())) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(np.allclose(encoding["labels"][0]["iscrowd"].numpy(), expected_is_crowd.numpy())) - # verify class_labels - expected_class_labels = mindspore.tensor([17, 17, 63, 75, 75, 93]) - 
self.assertTrue(np.allclose(encoding["labels"][0]["class_labels"].numpy(), expected_class_labels.numpy())) - # verify masks - expected_masks_sum = 822873 - self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue(np.allclose(encoding["labels"][0]["orig_size"].numpy(), expected_orig_size.numpy())) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue(np.allclose(encoding["labels"][0]["size"].numpy(), expected_size.numpy())) - - @slow - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->ConditionalDetr, facebook/detr-resnet-50 ->microsoft/conditional-detr-resnet-50 - def test_batched_coco_detection_annotations(self): - image_0 = Image.open(self.fixtures_path / "000000039769.png") - image_1 = Image.open(self.fixtures_path / "000000039769.png").resize((800, 800)) - - with open(self.fixtures_path / "coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotations_0 = {"image_id": 39769, "annotations": target} - annotations_1 = {"image_id": 39769, "annotations": target} - - # Adjust the bounding boxes for the resized image - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotations_1["annotations"])): - coords = annotations_1["annotations"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotations_1["annotations"][i]["bbox"] = new_bbox - - images = [image_0, image_1] - annotations = [annotations_0, annotations_1] - - image_processing = ConditionalDetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="ms", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1e-3)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - 
do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1)) - - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->ConditionalDetr - def test_batched_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) - - with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotation_1["segments_info"])): - coords = annotation_1["segments_info"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotation_1["segments_info"][i]["bbox"] = new_bbox - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - images = [image_0, image_1] - annotations = [annotation_0, annotation_1] - - # encode them - image_processing = ConditionalDetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="ms", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = 
mindspore.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1e-3)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1)) - - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr - def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = ops.ones([200, 100, 3], dtype=mindspore.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = ConditionalDetrImageProcessor( - size={"height": 100, "width": 50}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 100, 50)) - - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = ConditionalDetrImageProcessor( - 
size={"height": 200, "width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = ConditionalDetrImageProcessor( - size={"height": 100, "width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 100, 100)) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = ConditionalDetrImageProcessor( - size={"height": 300, "width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 301, 101)) - - ### Check for batch - image_2 = ops.ones([100, 150, 3], dtype=mindspore.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = ConditionalDetrImageProcessor( - size={"height": 150, "width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 150, 100)) - - def test_longest_edge_shortest_edge_resizing_strategy(self): - image_1 = ops.ones([958, 653, 3], dtype=mindspore.uint8) - - # max size is set; width < height; - # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 - image_processor = ConditionalDetrImageProcessor( - size={"longest_edge": 640, "shortest_edge": 640}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 640, 436)) - - image_2 = ops.ones([653, 958, 3], dtype=mindspore.uint8) - # max size is set; height < width; - # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 - image_processor = ConditionalDetrImageProcessor( - size={"longest_edge": 640, "shortest_edge": 640}, - do_pad=False, - ) - inputs = image_processor(images=[image_2], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 436, 640)) - - image_3 = ops.ones([100, 120, 3], dtype=mindspore.uint8) - # max size is set; width == size; height > max_size; - # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 - image_processor = ConditionalDetrImageProcessor( - size={"longest_edge": 118, "shortest_edge": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_3], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 98, 118)) - - image_4 = ops.ones([128, 50, 3], dtype=mindspore.uint8) - # max size is set; height == size; width < max_size; - # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 - image_processor = ConditionalDetrImageProcessor( - size={"longest_edge": 256, "shortest_edge": 50}, - do_pad=False, - ) - inputs = image_processor(images=[image_4], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 128, 50)) - - image_5 = ops.ones([50, 50, 3], dtype=mindspore.uint8) - # max size is set; height == width; width < max_size; - # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 - image_processor = ConditionalDetrImageProcessor( - size={"longest_edge": 117, "shortest_edge": 50}, - do_pad=False, - ) - inputs = image_processor(images=[image_5], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 
3, 50, 50)) diff --git a/tests/transformers/models/conditional_detr/test_modeling_conditional_detr.py b/tests/transformers/models/conditional_detr/test_modeling_conditional_detr.py deleted file mode 100644 index ee52925bf..000000000 --- a/tests/transformers/models/conditional_detr/test_modeling_conditional_detr.py +++ /dev/null @@ -1,600 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Conditional DETR model.""" - -import pathlib -import inspect -import math -import unittest - -import numpy as np -from mindnlp.transformers import ConditionalDetrConfig, ResNetConfig -from mindnlp.utils.testing_utils import require_vision, slow, is_mindspore_available, require_mindspore, is_vision_available, get_tests_dir -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor -#from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import ( - ConditionalDetrForObjectDetection, - ConditionalDetrForSegmentation, - ConditionalDetrModel, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ConditionalDetrImageProcessor - - -class ConditionalDetrModelTester: - def __init__( - self, - parent, - batch_size=8, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - num_queries=12, - num_channels=3, - min_size=200, - max_size=200, - n_targets=8, - num_labels=91, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_queries = num_queries - self.num_channels = num_channels - self.min_size = min_size - self.max_size = max_size - self.n_targets = n_targets - self.num_labels = num_labels - - # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) - self.decoder_seq_length = self.num_queries - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) - - pixel_mask = ops.ones(self.batch_size, self.min_size, self.max_size) - - labels = None - if self.use_labels: - # labels is a list of Dict (each Dict being the labels for a given example in 
the batch) - labels = [] - for i in range(self.batch_size): - target = {} - target["class_labels"] = ops.randint( - low=0, high=self.num_labels, size=(self.n_targets,) - ).astype(mindspore.int32) - target["boxes"] = ops.rand(self.n_targets, 4) - target["masks"] = ops.rand(self.n_targets, self.min_size, self.max_size) - labels.append(target) - - config = self.get_config() - return config, pixel_values, pixel_mask, labels - - def get_config(self): - resnet_config = ResNetConfig( - num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - ) - return ConditionalDetrConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - num_queries=self.num_queries, - num_labels=self.num_labels, - use_timm_backbone=False, - backbone_config=resnet_config, - backbone=None, - use_pretrained_backbone=False, - ) - - def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} - return config, inputs_dict - - def create_and_check_conditional_detr_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDetrModel(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) - ) - - def create_and_check_conditional_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): - model = ConditionalDetrForObjectDetection(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) - self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels)) - self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - - -@require_mindspore -class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (ConditionalDetrModel, ConditionalDetrForObjectDetection, ConditionalDetrForSegmentation,)if is_mindspore_available()else () - - pipeline_model_mapping = ( - {"image-feature-extraction": ConditionalDetrModel, "object-detection": ConditionalDetrForObjectDetection} - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_torchscript = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - zero_init_hidden_state = True - - # special case for head models - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, 
return_labels=return_labels) - - if return_labels: - if model_class.__name__ in ["ConditionalDetrForObjectDetection", "ConditionalDetrForSegmentation"]: - labels = [] - for i in range(self.model_tester.batch_size): - target = {} - target["class_labels"] = ops.ones( - self.model_tester.n_targets, dtype=mindspore.int64 - ) - target["boxes"] = ops.ones( - self.model_tester.n_targets, 4, dtype=mindspore.float32 - ) - target["masks"] = ops.ones( - self.model_tester.n_targets, - self.model_tester.min_size, - self.model_tester.max_size, - dtype=mindspore.float32, - ) - labels.append(target) - inputs_dict["labels"] = labels - - return inputs_dict - - def setUp(self): - self.model_tester = ConditionalDetrModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConditionalDetrConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_conditional_detr_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_conditional_detr_model(*config_and_inputs) - - def test_conditional_detr_object_detection_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) - - # TODO: check if this works again for MindSpore 2.x.y - @unittest.skip(reason="Got `CUDA error: misaligned address` with MindSpore 2.0.0.") - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip(reason="Conditional DETR does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Conditional DETR does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="Conditional DETR does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Conditional DETR is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="Conditional DETR does not use token embeddings") - def test_resize_tokens_embeddings(self): - pass - - @slow - def test_model_outputs_equivalence(self): - # TODO Niels: fix me! 
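For reference, the per-image target dicts that the object-detection and segmentation heads consume are built twice above, once with random values in prepare_config_and_inputs and once with ones in _prepare_for_class. A minimal sketch of that structure follows; the helper name and values are illustrative only, and the boxes are assumed to be in the normalized (centre_x, centre_y, width, height) form that the image processor produces when do_convert_annotations is left at its default.

import mindspore
from mindnlp.core import ops

def make_dummy_target(n_targets=8, num_labels=91, height=200, width=200):
    # one dict per image; shapes mirror ConditionalDetrModelTester above
    return {
        "class_labels": ops.randint(low=0, high=num_labels, size=(n_targets,)).astype(mindspore.int32),
        "boxes": ops.rand(n_targets, 4),              # normalized (cx, cy, w, h) in [0, 1]
        "masks": ops.rand(n_targets, height, width),  # one mask per target
    }

labels = [make_dummy_target() for _ in range(8)]      # one entry per image in the batch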
- pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - decoder_seq_length = self.model_tester.decoder_seq_length - encoder_seq_length = self.model_tester.encoder_seq_length - decoder_key_length = self.model_tester.decoder_seq_length - encoder_key_length = self.model_tester.encoder_seq_length - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 6 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "ConditionalDetrForObjectDetection": - correct_outlen += 1 - # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks - if model_class.__name__ == "ConditionalDetrForSegmentation": - correct_outlen += 2 - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - 
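With the tester defaults above (min_size = max_size = 200, num_queries = 12, num_attention_heads = 8), the sequence lengths used in these attention-shape checks reduce to small concrete numbers, mirroring how ConditionalDetrModelTester computes them:

import math

min_size = max_size = 200
num_queries, num_heads = 12, 8

encoder_seq_length = math.ceil(min_size / 32) * math.ceil(max_size / 32)  # 7 * 7 = 49
decoder_seq_length = num_queries                                          # 12

# encoder self-attention maps: (batch, num_heads, 49, 49)
# decoder self-attention maps: (batch, num_heads, 12, 12)
# cross-attention maps:        (batch, num_heads, 12, 49)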
self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - @unittest.skip("MindSpore has no .grad") - def test_retain_grad_hidden_states_attentions(self): - # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] - encoder_hidden_states.retain_grad() - encoder_attentions.retain_grad() - - decoder_attentions = outputs.decoder_attentions[0] - decoder_attentions.retain_grad() - - cross_attentions = outputs.cross_attentions[0] - cross_attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) - self.assertIsNotNone(cross_attentions.grad) - - def test_forward_auxiliary_loss(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.auxiliary_loss = True - - # only test for object detection and segmentation model - for model_class in self.all_model_classes[1:]: - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - outputs = model(**inputs) - - self.assertIsNotNone(outputs.auxiliary_outputs) - self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = ["pixel_values", "pixel_mask"] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "encoder_outputs"] - if "head_mask" in arg_names and "decoder_head_mask" in arg_names - else [] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["pixel_values", "pixel_mask"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - @unittest.skip("MindNLP does not depend on timm") - def test_different_timm_backbone(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # let's pick a random timm backbone - config.backbone = "tf_mobilenetv3_small_075" - config.backbone_config = None - config.use_timm_backbone = True - config.backbone_kwargs = {"out_indices": [2, 3, 4]} - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if model_class.__name__ == "ConditionalDetrForObjectDetection": - expected_shape = ( - self.model_tester.batch_size, - self.model_tester.num_queries, - self.model_tester.num_labels, - ) - self.assertEqual(outputs.logits.shape, expected_shape) - # 
Confirm out_indices was propogated to backbone - self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) - elif model_class.__name__ == "ConditionalDetrForSegmentation": - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) - else: - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) - - self.assertTrue(outputs) - - def test_hf_backbone(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Load a pretrained HF checkpoint as backbone - config.backbone = "microsoft/resnet-18" - config.backbone_config = None - config.use_timm_backbone = False - config.use_pretrained_backbone = True - config.backbone_kwargs = {"out_indices": [2, 3, 4]} - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if model_class.__name__ == "ConditionalDetrForObjectDetection": - expected_shape = ( - self.model_tester.batch_size, - self.model_tester.num_queries, - self.model_tester.num_labels, - ) - self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) - elif model_class.__name__ == "ConditionalDetrForSegmentation": - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.conditional_detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) - else: - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) - - self.assertTrue(outputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.init_xavier_std = 1e9 - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - if "bbox_attention" in name and "bias" not in name: - self.assertLess( - 100000, - abs(param.data.max().item()), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip("MindNLP AutoModel.from_pretrained() not compatible") - def test_save_load_fast_init_from_base(self): - pass - - -TOLERANCE = 1e-4 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open(pathlib.Path(get_tests_dir()) / "fixtures/tests_samples/COCO/000000039769.png") - return image - - -@unittest.skip("MindNLP does not depend on timm") -@require_vision -@slow -class ConditionalDetrModelIntegrationTests(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - ConditionalDetrImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50") - if is_vision_available() - else None - ) - - def test_inference_no_head(self): - model = ConditionalDetrModel.from_pretrained("microsoft/conditional-detr-resnet-50") - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - - - 
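The object-detection integration test further below relies on post_process_object_detection to turn the model's normalized (centre_x, centre_y, width, height) predictions into absolute (x_min, y_min, x_max, y_max) boxes, which is the same conversion the batched-annotation image-processor tests spell out with ops.vstack. A minimal standalone sketch of that math (the helper name is illustrative, not part of the API):

def center_to_corners(boxes_cxcywh, img_height, img_width):
    # boxes_cxcywh: iterable of normalized (cx, cy, w, h) tuples in [0, 1]
    corners = []
    for cx, cy, w, h in boxes_cxcywh:
        cx, w = cx * img_width, w * img_width    # back to absolute pixels
        cy, h = cy * img_height, h * img_height
        corners.append((cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2))
    return corners

# e.g. a full-image box (0.5, 0.5, 1.0, 1.0) on a 480x640 image -> (0, 0, 640, 480)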
outputs = model(**encoding) - - expected_shape = (1, 300, 256) - assert outputs.last_hidden_state.shape == expected_shape - expected_slice = mindspore.tensor( - [[0.4222, 0.7471, 0.8760], [0.6395, -0.2729, 0.7127], [-0.3090, 0.7642, 0.9529]] - ) - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - def test_inference_object_detection_head(self): - model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50") - - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] - - outputs = model(pixel_values, pixel_mask) - - # verify logits + box predictions - expected_shape_logits = (1, model.config.num_queries, model.config.num_labels) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - expected_slice_logits = mindspore.tensor( - [[-10.4372, -5.7558, -8.6764], [-10.5410, -5.8704, -8.0590], [-10.6827, -6.3469, -8.3923]] - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].asnumpy(), expected_slice_logits.asnumpy(), atol=1e-4)) - - expected_shape_boxes = (1, model.config.num_queries, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - expected_slice_boxes = mindspore.tensor( - [[0.7733, 0.6576, 0.4496], [0.5171, 0.1184, 0.9094], [0.8846, 0.5647, 0.2486]] - ) - self.assertTrue(np.allclose(outputs.pred_boxes[0, :3, :3].asnumpy(), expected_slice_boxes.asnumpy(), atol=1e-4)) - - # verify postprocessing - results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - expected_scores = mindspore.tensor([0.8330, 0.8313, 0.8039, 0.6829, 0.5355]) - expected_labels = [75, 17, 17, 75, 63] - expected_slice_boxes = mindspore.tensor([38.3089, 72.1022, 177.6293, 118.4512]) - - self.assertEqual(len(results["scores"]), 5) - self.assertTrue(np.allclose(results["scores"].asnumpy(), expected_scores.asnumpy(), atol=1e-4)) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) - self.assertTrue(np.allclose(results["boxes"][0, :].asnumpy(), expected_slice_boxes.asnumpy())) diff --git a/tests/transformers/models/convbert/__init__.py b/tests/transformers/models/convbert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/convbert/test_modeling_convbert.py b/tests/transformers/models/convbert/test_modeling_convbert.py deleted file mode 100644 index 44a4e82f1..000000000 --- a/tests/transformers/models/convbert/test_modeling_convbert.py +++ /dev/null @@ -1,483 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" Testing suite for the PyTorch ConvBERT model. 
""" -import unittest - -from mindnlp.transformers import ConvBertConfig, get_values -from mindnlp.utils.testing_utils import ( - require_mindspore, slow, is_mindspore_available -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - ConvBertForMaskedLM, - ConvBertForMultipleChoice, - ConvBertForQuestionAnswering, - ConvBertForSequenceClassification, - ConvBertForTokenClassification, - ConvBertModel, - ) - - -class ConvBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask( - [self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return ConvBertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def 
prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ConvBertModel(config=config) - result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ConvBertForMaskedLM(config=config) - result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ConvBertForQuestionAnswering(config=config) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, - (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, - (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ConvBertForSequenceClassification(config) - result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, - (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ConvBertForTokenClassification(config=config) - result = model(input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = ConvBertForMultipleChoice(config=config) - multiple_choice_inputs_ids = input_ids.unsqueeze( - 1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze( - 1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze( - 1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, 
- token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, - (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, - "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - ConvBertModel, - ConvBertForMaskedLM, - ConvBertForMultipleChoice, - ConvBertForQuestionAnswering, - ConvBertForSequenceClassification, - ConvBertForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": ConvBertModel, - "fill-mask": ConvBertForMaskedLM, - "question-answering": ConvBertForQuestionAnswering, - "text-classification": ConvBertForSequenceClassification, - "token-classification": ConvBertForTokenClassification, - "zero-shot": ConvBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = ConvBertModelTester(self) - self.config_tester = ConfigTester( - self, config_class=ConvBertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice( - *config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering( - *config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification( - *config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification( - *config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = ConvBertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr( - self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr( - self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr( - self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr( - self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - 
encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual( - len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual( - len(attentions), self.model_tester.num_hidden_layers) - - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads / 2, - encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads / 2, - encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Question Answering model returns start_logits and end_logits - if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): - correct_outlen += 1 # start_logits and end_logits instead of only 1 output - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), - self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), - self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), - self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads / 2, - encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - 
[self.model_tester.num_attention_heads / 2, - encoder_seq_length, encoder_key_length], - ) - - def test_model_for_input_embeds(self): - batch_size = 2 - seq_length = 10 - inputs_embeds = mindspore.ops.rand([batch_size, seq_length, 768]) - config = self.model_tester.get_config() - model = ConvBertModel(config=config) - result = model(inputs_embeds=inputs_embeds) - self.assertEqual(result.last_hidden_state.shape, - (batch_size, seq_length, config.hidden_size)) - - def test_reducing_attention_heads(self): - config, *inputs_dict = self.model_tester.prepare_config_and_inputs() - config.head_ratio = 4 - self.model_tester.create_and_check_for_masked_lm(config, *inputs_dict) - - -@require_mindspore -class ConvBertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = ConvBertModel.from_pretrained("YituTech/conv-bert-base") - input_ids = mindspore.tensor([[1, 2, 3, 4, 5, 6]]) - output = model(input_ids)[0] - - expected_shape = (1, 6, 768) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.Tensor( - input_data=[[[-0.0864, -0.4898, -0.3677], - [0.1434, -0.2952, -0.7640], - [-0.0112, -0.4432, -0.5432]]] - ) - - self.assertTrue(mindspore.allclose( - output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/transformers/models/convnext/__init__.py b/tests/transformers/models/convnext/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/convnext/test_modeling_convnext.py b/tests/transformers/models/convnext/test_modeling_convnext.py deleted file mode 100644 index 3015ae821..000000000 --- a/tests/transformers/models/convnext/test_modeling_convnext.py +++ /dev/null @@ -1,301 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore ConvNext model. 
""" - - -import unittest -import numpy as np - -from mindnlp.transformers import ConvNextConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ConvNextBackbone, ConvNextForImageClassification, ConvNextModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class ConvNextModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - num_channels=3, - num_stages=4, - hidden_sizes=[10, 20, 30, 40], - depths=[2, 2, 3, 2], - is_training=True, - use_labels=True, - intermediate_size=37, - hidden_act="gelu", - num_labels=10, - initializer_range=0.02, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.num_stages = num_stages - self.hidden_sizes = hidden_sizes - self.depths = depths - self.is_training = is_training - self.use_labels = use_labels - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_labels = num_labels - self.initializer_range = initializer_range - self.out_features = out_features - self.out_indices = out_indices - self.scope = scope - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - return config, pixel_values, labels - - def get_config(self): - return ConvNextConfig( - num_channels=self.num_channels, - hidden_sizes=self.hidden_sizes, - depths=self.depths, - num_stages=self.num_stages, - hidden_act=self.hidden_act, - is_decoder=False, - initializer_range=self.initializer_range, - out_features=self.out_features, - out_indices=self.out_indices, - num_labels=self.num_labels, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ConvNextModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected last hidden states: B, C, H // 32, W // 32 - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - model = ConvNextForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = ConvNextBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify hidden states - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) 
- self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) - - # verify backbone works with out_features=None - config.out_features = None - model = ConvNextBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[-1], 1, 1]) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ConvNextModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ConvNext does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - ConvNextModel, - ConvNextForImageClassification, - ConvNextBackbone, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": ConvNextModel, "image-classification": ConvNextForImageClassification} - if is_mindspore_available() - else {} - ) - - fx_compatible = True - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = ConvNextModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConvNextConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="ConvNext does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="ConvNext does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="ConvNext does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_stages = self.model_tester.num_stages - self.assertEqual(len(hidden_states), expected_num_stages + 1) - - # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.image_size // 4, 
self.model_tester.image_size // 4], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/convnext-tiny-224" - model = ConvNextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ConvNextModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-0.0260, -0.4739, 0.1911]) - print(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - -@require_mindspore -class ConvNextBackboneTest(unittest.TestCase, BackboneTesterMixin): - all_model_classes = (ConvNextBackbone,) if is_mindspore_available() else () - config_class = ConvNextConfig - - has_attentions = False - - def setUp(self): - self.model_tester = ConvNextModelTester(self) \ No newline at end of file diff --git a/tests/transformers/models/convnextv2/__init__.py b/tests/transformers/models/convnextv2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/convnextv2/test_modeling_convnextv2.py b/tests/transformers/models/convnextv2/test_modeling_convnextv2.py deleted file mode 100644 index 39b97ad07..000000000 --- a/tests/transformers/models/convnextv2/test_modeling_convnextv2.py +++ /dev/null @@ -1,332 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore ConvNextV2 model. 
""" - - -import unittest -import numpy as np - -from mindnlp.transformers.models.convnextv2 import ConvNextV2Config -from mindnlp.transformers.models.auto import get_values -from mindnlp.transformers.models.auto.modeling_auto import MODEL_FOR_BACKBONE_MAPPING_NAMES, MODEL_MAPPING_NAMES -from mindnlp.utils.testing_utils import require_vision, slow, require_mindspore, is_mindspore_available -from mindnlp.utils.import_utils import is_vision_available -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - - -if is_mindspore_available(): - import mindspore - from mindnlp.transformers import ConvNextV2ForImageClassification, ConvNextV2Model, ConvNextV2Backbone - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class ConvNextV2ModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - num_channels=3, - num_stages=4, - hidden_sizes=[10, 20, 30, 40], - depths=[2, 2, 3, 2], - is_training=True, - use_labels=True, - intermediate_size=37, - hidden_act="gelu", - num_labels=10, - initializer_range=0.02, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.num_stages = num_stages - self.hidden_sizes = hidden_sizes - self.depths = depths - self.is_training = is_training - self.use_labels = use_labels - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_labels = num_labels - self.initializer_range = initializer_range - self.out_features = out_features - self.out_indices = out_indices - self.scope = scope - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return ConvNextV2Config( - num_channels=self.num_channels, - hidden_sizes=self.hidden_sizes, - depths=self.depths, - num_stages=self.num_stages, - hidden_act=self.hidden_act, - is_decoder=False, - initializer_range=self.initializer_range, - out_features=self.out_features, - out_indices=self.out_indices, - num_labels=self.num_labels, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ConvNextV2Model(config=config) - model.set_train(False) - result = model(pixel_values) - # expected last hidden states: B, C, H // 32, W // 32 - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - model = ConvNextV2ForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = ConvNextV2Backbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify hidden states - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, 
self.hidden_sizes[1], 4, 4]) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) - - # verify backbone works with out_features=None - config.out_features = None - model = ConvNextV2Backbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[-1], 1, 1]) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - def prepare_config_and_inputs_with_labels(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values, "labels": labels} - return config, inputs_dict - - -@require_mindspore -class ConvNextV2ModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ConvNextV2 does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - ConvNextV2ForImageClassification, - ConvNextV2Backbone, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": ConvNextV2Model, "image-classification": ConvNextV2ForImageClassification} - if is_mindspore_available() - else {} - ) - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = ConvNextV2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=ConvNextV2Config, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="ConvNextV2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="ConvNextV2 does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="ConvNextV2 does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - def test_training(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() - config.return_dict = True - - if model_class.__name__ in [ - *get_values(MODEL_MAPPING_NAMES), - *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES), - ]: - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - - def test_training_gradient_checkpointing(self): - if 
not self.model_tester.is_training: - return - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() - config.use_cache = False - config.return_dict = True - - if ( - model_class.__name__ - in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] - ): - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_stages = self.model_tester.num_stages - self.assertEqual(len(hidden_states), expected_num_stages + 1) - - # ConvNextV2's feature maps are of shape (batch_size, num_channels, height, width) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.image_size // 4, self.model_tester.image_size // 4], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/convnextv2-tiny-1k-224" - model = ConvNextV2Model.from_pretrained(model_name, from_pt = True) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ConvNextV2ModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224", from_pt = True) if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = ConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224", from_pt = True) - - #preprocessor = self.default_image_processor - preprocessor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224", from_pt = True) - image = prepare_img() - inputs = preprocessor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([0.9996, 0.1966, -0.4386]) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/cpmant/__init__.py b/tests/transformers/models/cpmant/__init__.py deleted file mode 100644 index e69de29bb..000000000 
diff --git a/tests/transformers/models/cpmant/test_modeling_cpmant.py b/tests/transformers/models/cpmant/test_modeling_cpmant.py deleted file mode 100644 index a7e41f018..000000000 --- a/tests/transformers/models/cpmant/test_modeling_cpmant.py +++ /dev/null @@ -1,235 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore CPMAnt model. """ - -import unittest -import numpy as np -from mindnlp.utils.testing_utils import is_mindspore_available, require_mindspore, tooslow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - CpmAntConfig, - CpmAntForCausalLM, - CpmAntModel, - CpmAntTokenizer, - ) - -@require_mindspore -class CpmAntModelTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=8, - is_training=True, - use_token_type_ids=False, - use_input_mask=False, - use_labels=False, - use_mc_token_ids=False, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - num_buckets=32, - max_distance=128, - prompt_length=8, - prompt_types=8, - segment_types=8, - init_std=1.0, - return_dict=True, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.num_buckets = num_buckets - self.max_distance = max_distance - self.prompt_length = prompt_length - self.prompt_types = prompt_types - self.segment_types = segment_types - self.init_std = init_std - self.return_dict = return_dict - - def prepare_config_and_inputs(self): - input_ids = {} - input_ids["input_ids"] = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).astype(mindspore.int32) - input_ids["use_cache"] = False - - config = self.get_config() - - return (config, input_ids) - - def get_config(self): - return CpmAntConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - dim_ff=self.intermediate_size, - position_bias_num_buckets=self.num_buckets, - position_bias_max_distance=self.max_distance, - prompt_types=self.prompt_types, - prompt_length=self.prompt_length, - segment_types=self.segment_types, - use_cache=True, - init_std=self.init_std, - return_dict=self.return_dict, - ) - - def create_and_check_cpmant_model(self, config, input_ids, *args): - model = CpmAntModel(config=config) - - model.set_train(False) - - 
hidden_states = model(**input_ids).last_hidden_state - - self.parent.assertEqual(hidden_states.shape, (self.batch_size, self.seq_length, config.hidden_size)) - - def create_and_check_lm_head_model(self, config, input_ids, *args): - model = CpmAntForCausalLM(config) - - input_ids["input_ids"] = input_ids["input_ids"] - model.set_train(False) - - model_output = model(**input_ids) - self.parent.assertEqual( - model_output.logits.shape, - (self.batch_size, self.seq_length, config.vocab_size + config.prompt_types * config.prompt_length), - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -class CpmAntModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CpmAntModel, CpmAntForCausalLM) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": CpmAntModel, "text-generation": CpmAntForCausalLM} if is_mindspore_available() else {} - ) - - test_pruning = False - test_missing_keys = False - test_mismatched_shapes = False - test_head_masking = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = CpmAntModelTester(self) - self.config_tester = ConfigTester(self, config_class=CpmAntConfig) - - def test_config(self): - self.config_tester.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def test_inputs_embeds(self): - unittest.skip("CPMAnt doesn't support input_embeds.")(self.test_inputs_embeds) - - def test_retain_grad_hidden_states_attentions(self): - unittest.skip( - "CPMAnt doesn't support retain grad in hidden_states or attentions, because prompt management will peel off the output.hidden_states from graph.\ - So is attentions. We strongly recommand you use loss to tune model." - )(self.test_retain_grad_hidden_states_attentions) - - def test_cpmant_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_cpmant_model(config, inputs) - - def test_cpmant_lm_head_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(config, inputs) - - -@require_mindspore -class CpmAntModelIntegrationTest(unittest.TestCase): - @tooslow - def test_inference_masked_lm(self): - texts = "今天天气真好!" - model_path = "openbmb/cpm-ant-10b" - model = CpmAntModel.from_pretrained(model_path) - tokenizer = CpmAntTokenizer.from_pretrained(model_path) - inputs = tokenizer(texts, return_tensors="ms") - hidden_states = model(**inputs).last_hidden_state - - expected_slice = mindspore.tensor( - [[[6.1708, 5.9244, 1.0835], [6.5207, 6.2893, -11.3324], [-1.0107, -0.0576, -5.9577]]], - ) - self.assertTrue(np.allclose(hidden_states[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-2)) - - -@require_mindspore -class CpmAntForCausalLMlIntegrationTest(unittest.TestCase): - @tooslow - def test_inference_casual(self): - texts = "今天天气真好!" 
- model_path = "openbmb/cpm-ant-10b" - model = CpmAntForCausalLM.from_pretrained(model_path) - tokenizer = CpmAntTokenizer.from_pretrained(model_path) - inputs = tokenizer(texts, return_tensors="ms") - hidden_states = model(**inputs).logits - - expected_slice = mindspore.tensor( - [[[-6.4267, -6.4083, -6.3958], [-5.8802, -5.9447, -5.7811], [-5.3896, -5.4820, -5.4295]]], - ) - self.assertTrue(np.allclose(hidden_states[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-2)) - - @tooslow - def test_simple_generation(self): - model_path = "openbmb/cpm-ant-10b" - model = CpmAntForCausalLM.from_pretrained(model_path) - tokenizer = CpmAntTokenizer.from_pretrained(model_path) - texts = "今天天气不错," - expected_output = "今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的" - model_inputs = tokenizer(texts, return_tensors="ms") - token_ids = model.generate(**model_inputs) - output_texts = tokenizer.batch_decode(token_ids) - self.assertEqual(expected_output, output_texts) - - @tooslow - def test_batch_generation(self): - model_path = "openbmb/cpm-ant-10b" - model = CpmAntForCausalLM.from_pretrained(model_path) - tokenizer = CpmAntTokenizer.from_pretrained(model_path) - texts = ["今天天气不错,", "新年快乐,万事如意!"] - expected_output = [ - "今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的", - "新年快乐,万事如意!在这辞旧迎新的美好时刻,我谨代表《农村新技术》杂志社全体同仁,向一直以来关心、支持《农村新技术》杂志发展的各级领导、各界朋友和广大读者致以最诚挚的", - ] - model_inputs = tokenizer(texts, return_tensors="ms", padding=True) - token_ids = model.generate(**model_inputs) - output_texts = tokenizer.batch_decode(token_ids) - self.assertEqual(expected_output, output_texts) \ No newline at end of file diff --git a/tests/transformers/models/cpmbee/__init__.py b/tests/transformers/models/cpmbee/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/cpmbee/test_modeling_cpmbee.py b/tests/transformers/models/cpmbee/test_modeling_cpmbee.py deleted file mode 100644 index 5d4c19c54..000000000 --- a/tests/transformers/models/cpmbee/test_modeling_cpmbee.py +++ /dev/null @@ -1,208 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore CpmBee model. 
""" - - -import unittest - -from mindnlp.utils.testing_utils import is_mindspore_available, require_mindspore, tooslow, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - CpmBeeConfig, - CpmBeeForCausalLM, - CpmBeeModel, - CpmBeeTokenizer, - ) - -@require_mindspore -class CpmBeeModelTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=8, - is_training=True, - use_token_type_ids=False, - use_input_mask=False, - use_labels=False, - use_mc_token_ids=False, - vocab_size=99, - hidden_size=32, - num_hidden_layers=3, - num_attention_heads=4, - intermediate_size=37, - num_buckets=32, - max_distance=128, - position_bias_num_segment_buckets=32, - init_std=1.0, - return_dict=True, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.num_buckets = num_buckets - self.max_distance = max_distance - self.position_bias_num_segment_buckets = position_bias_num_segment_buckets - self.init_std = init_std - self.return_dict = return_dict - - def prepare_config_and_inputs(self): - input_ids = {} - input_ids["input_ids"] = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids["span"] = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids["length"] = ops.full((self.batch_size,), self.seq_length, dtype=mindspore.int64) - input_ids["use_cache"] = False - - config = self.get_config() - - return (config, input_ids) - - def get_config(self): - return CpmBeeConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - dim_ff=self.intermediate_size, - position_bias_num_buckets=self.num_buckets, - position_bias_max_distance=self.max_distance, - position_bias_num_segment_buckets=self.position_bias_num_segment_buckets, - use_cache=True, - init_std=self.init_std, - return_dict=self.return_dict, - ) - - def create_and_check_cpmbee_model(self, config, input_ids, *args): - model = CpmBeeModel(config=config) - model.set_train(False) - - hidden_states = model(**input_ids).last_hidden_state - - self.parent.assertEqual(hidden_states.shape, (self.batch_size, self.seq_length, config.hidden_size)) - - def create_and_check_lm_head_model(self, config, input_ids, *args): - model = CpmBeeForCausalLM(config) - input_ids["input_ids"] = input_ids["input_ids"] - model.set_train(False) - - model_output = model(**input_ids) - self.parent.assertEqual( - model_output.logits.shape, - (self.batch_size, self.seq_length, config.vocab_size), - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -class CpmBeeModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CpmBeeModel, CpmBeeForCausalLM) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": CpmBeeModel, "text-generation": CpmBeeForCausalLM} if 
is_mindspore_available() else {} - ) - - test_pruning = False - test_missing_keys = False - test_mismatched_shapes = False - test_head_masking = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = CpmBeeModelTester(self) - self.config_tester = ConfigTester(self, config_class=CpmBeeConfig) - - def test_config(self): - self.config_tester.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def test_inputs_embeds(self): - unittest.skip("CPMBee doesn't support input_embeds.")(self.test_inputs_embeds) - - def test_retain_grad_hidden_states_attentions(self): - unittest.skip( - "CPMBee doesn't support retain grad in hidden_states or attentions, because prompt management will peel off the output.hidden_states from graph.\ - So is attentions. We strongly recommand you use loss to tune model." - )(self.test_retain_grad_hidden_states_attentions) - - def test_cpmbee_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_cpmbee_model(config, inputs) - - def test_cpmbee_lm_head_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(config, inputs) - - -@require_mindspore -class CpmBeeForCausalLMlIntegrationTest(unittest.TestCase): - @tooslow - def test_simple_generation(self): - texts = {"input": "今天天气不错,", "": ""} - model = CpmBeeForCausalLM.from_pretrained("openbmb/cpm-bee-10b") - tokenizer = CpmBeeTokenizer.from_pretrained("openbmb/cpm-bee-10b") - output_texts = model.generate(texts, tokenizer) - expected_output = {"input": "今天天气不错,", "": "适合睡觉。"} - self.assertEqual(expected_output[""], output_texts[0][""]) - - @slow - def test_simple_generation_1b(self): - texts = {"input": "今天天气不错,", "": ""} - model = CpmBeeForCausalLM.from_pretrained("openbmb/cpm-bee-1b") - tokenizer = CpmBeeTokenizer.from_pretrained("openbmb/cpm-bee-1b") - output_texts = model.generate(texts, tokenizer) - expected_output = {"input": "今天天气不错,", "": "适合睡觉。"} - self.assertEqual(expected_output[""], output_texts[0][""]) - - @slow - def test_simple_generation_2b(self): - texts = {"input": "今天天气不错,", "": ""} - model = CpmBeeForCausalLM.from_pretrained("openbmb/cpm-bee-2b") - tokenizer = CpmBeeTokenizer.from_pretrained("openbmb/cpm-bee-2b") - output_texts = model.generate(texts, tokenizer) - expected_output = {"input": "今天天气不错,", "": "适合睡觉。"} - self.assertEqual(expected_output[""], output_texts[0][""]) - - @slow - def test_simple_generation_5b(self): - texts = {"input": "今天天气不错,", "": ""} - model = CpmBeeForCausalLM.from_pretrained("openbmb/cpm-bee-5b") - tokenizer = CpmBeeTokenizer.from_pretrained("openbmb/cpm-bee-5b") - output_texts = model.generate(texts, tokenizer) - expected_output = {"input": "今天天气不错,", "": "适合睡觉。"} - self.assertEqual(expected_output[""], output_texts[0][""]) diff --git a/tests/transformers/models/ctrl/__init__.py b/tests/transformers/models/ctrl/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/ctrl/test_modeling_ctrl.py b/tests/transformers/models/ctrl/test_modeling_ctrl.py deleted file mode 100644 index 98ea014cc..000000000 --- a/tests/transformers/models/ctrl/test_modeling_ctrl.py +++ /dev/null @@ -1,318 
+0,0 @@ -# coding=utf-8 -# Copyright 2018 Salesforce and HuggingFace Inc. team. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""test ctrl modeling""" -import gc -import unittest - -from mindnlp.transformers import CTRLConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - CTRLForSequenceClassification, - CTRLLMHeadModel, - CTRLModel, - ) - - -class CTRLModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.pad_token_id = self.vocab_size - 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor( - [self.batch_size, self.num_choices], self.seq_length - ) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = 
ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return CTRLConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - dff=self.intermediate_size, - # hidden_act=self.hidden_act, - # hidden_dropout_prob=self.hidden_dropout_prob, - # attention_probs_dropout_prob=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - # type_vocab_size=self.type_vocab_size, - # initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_ctrl_model( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = CTRLModel(config=config) - model.set_train(False) - - model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_lm_head_model( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = CTRLLMHeadModel(config) - model.set_train(False) - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - def create_and_check_ctrl_for_sequence_classification( - self, config, input_ids, head_mask, token_type_ids, *args - ): - config.num_labels = self.num_labels - model = CTRLForSequenceClassification(config) - model.set_train(False) - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - -@require_mindspore -class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (CTRLModel, CTRLLMHeadModel, CTRLForSequenceClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (CTRLLMHeadModel,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": CTRLModel, - "text-classification": CTRLForSequenceClassification, - "text-generation": CTRLLMHeadModel, - "zero-shot": CTRLForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = True - test_resize_embeddings = False - test_head_masking = False - - # TODO: Fix the failed tests - # def is_pipeline_test_to_skip( - # self, - # pipeline_test_casse_name, - # config_class, - # model_architecture, - # tokenizer_name, - # processor_name, - # ): - # if 
pipeline_test_casse_name == "ZeroShotClassificationPipelineTests": - # # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. - # # `CTRLConfig` was never used in pipeline tests, either because of a missing checkpoint or because a tiny - # # config could not be created. - # return True - - # return False - - def setUp(self): - self.model_tester = CTRLModelTester(self) - self.config_tester = ConfigTester(self, config_class=CTRLConfig, n_embd=37) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - def test_config(self): - self.config_tester.run_common_tests() - - def test_ctrl_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_ctrl_model(*config_and_inputs) - - def test_ctrl_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/ctrl" - model = CTRLModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - -@require_mindspore -class CTRLModelLanguageGenerationTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - @slow - def test_lm_generate_ctrl(self): - model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl") - input_ids = mindspore.tensor( - [[11859, 0, 1611, 8]], dtype=mindspore.int64 - ) # Legal the president is - expected_output_ids = [ - 11859, - 0, - 1611, - 8, - 5, - 150, - 26449, - 2, - 19, - 348, - 469, - 3, - 2595, - 48, - 20740, - 246533, - 246533, - 19, - 30, - 5, - ] # Legal the president is a good guy and I don't want to lose my job. \n \n I have a - - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) diff --git a/tests/transformers/models/cvt/__init__.py b/tests/transformers/models/cvt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/cvt/test_modeling_cvt.py b/tests/transformers/models/cvt/test_modeling_cvt.py deleted file mode 100644 index 08c2d67f8..000000000 --- a/tests/transformers/models/cvt/test_modeling_cvt.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore CvT model. 
""" - - -import unittest -from math import floor - -import numpy as np -from mindnlp.transformers import CvtConfig -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import CvtForImageClassification, CvtModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class CvtConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "embed_dim")) - self.parent.assertTrue(hasattr(config, "num_heads")) - - -class CvtModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=64, - num_channels=3, - embed_dim=[16, 32, 48], - num_heads=[1, 2, 3], - depth=[1, 2, 10], - patch_sizes=[7, 3, 3], - patch_stride=[4, 2, 2], - patch_padding=[2, 1, 1], - stride_kv=[2, 2, 2], - cls_token=[False, False, True], - attention_drop_rate=[0.0, 0.0, 0.0], - initializer_range=0.02, - layer_norm_eps=1e-12, - is_training=True, - use_labels=True, - num_labels=2, # Check - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_sizes = patch_sizes - self.patch_stride = patch_stride - self.patch_padding = patch_padding - self.is_training = is_training - self.use_labels = use_labels - self.num_labels = num_labels - self.num_channels = num_channels - self.embed_dim = embed_dim - self.num_heads = num_heads - self.stride_kv = stride_kv - self.depth = depth - self.cls_token = cls_token - self.attention_drop_rate = attention_drop_rate - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - return config, pixel_values, labels - - def get_config(self): - return CvtConfig( - image_size=self.image_size, - num_labels=self.num_labels, - num_channels=self.num_channels, - embed_dim=self.embed_dim, - num_heads=self.num_heads, - patch_sizes=self.patch_sizes, - patch_padding=self.patch_padding, - patch_stride=self.patch_stride, - stride_kv=self.stride_kv, - depth=self.depth, - cls_token=self.cls_token, - attention_drop_rate=self.attention_drop_rate, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = CvtModel(config=config) - model.set_train(False) - result = model(pixel_values) - image_size = (self.image_size, self.image_size) - height, width = image_size[0], image_size[1] - for i in range(len(self.depth)): - height = floor(((height + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) - width = floor(((width + 2 * self.patch_padding[i] - self.patch_sizes[i]) / self.patch_stride[i]) + 1) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dim[-1], height, width)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - 
config.num_labels = self.num_labels - model = CvtForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class CvtModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Cvt does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (CvtModel, CvtForImageClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": CvtModel, "image-classification": CvtForImageClassification} - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = CvtModelTester(self) - self.config_tester = ConfigTester(self, config_class=CvtConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="Cvt does not output attentions") - def test_attention_outputs(self): - pass - - @unittest.skip(reason="Cvt does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Cvt does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = len(self.model_tester.depth) - self.assertEqual(len(hidden_states), expected_num_layers) - - # verify the first hidden states (first block) - self.assertListEqual( - list(hidden_states[0].shape[-3:]), - [ - self.model_tester.embed_dim[0], - self.model_tester.image_size // 4, - self.model_tester.image_size // 4, - ], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow 
- def test_model_from_pretrained(self): - model_name = "microsoft/cvt-13" - model = CvtModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class CvtModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("microsoft/cvt-13") - - @slow - def test_inference_image_classification_head(self): - model = CvtForImageClassification.from_pretrained("microsoft/cvt-13") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([0.9285, 0.9015, -0.3150]) - print(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/data2vec/__init__.py b/tests/transformers/models/data2vec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/data2vec/test_modeling_data2vec_audio.py b/tests/transformers/models/data2vec/test_modeling_data2vec_audio.py deleted file mode 100644 index c53c1169e..000000000 --- a/tests/transformers/models/data2vec/test_modeling_data2vec_audio.py +++ /dev/null @@ -1,723 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the Mindnlp Data2VecAudio model.""" - -import math -import unittest -import numpy as np -from datasets import load_dataset -from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -#from tests.ut.transformers.models.data2vec.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask - -from mindnlp.transformers import Data2VecAudioConfig -from mindnlp.utils.testing_utils import require_mindspore, slow,is_mindspore_available,require_soundfile -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, optim, value_and_grad - - from mindnlp.transformers import ( - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForCTC, - Data2VecAudioForSequenceClassification, - Data2VecAudioForXVector, - Data2VecAudioModel, - Wav2Vec2Processor, - ) - from mindnlp.transformers.models.data2vec.modeling_data2vec_audio import _compute_mask_indices - -class Data2VecAudioModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = 
random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - return config, input_values, attention_mask - - def get_config(self): - return Data2VecAudioConfig( - hidden_size=self.hidden_size, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Data2VecAudioModel(config=config) - - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Data2VecAudioModel(config=config) - - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Data2VecAudioModel(config=config) - - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Data2VecAudioModel(config=config) - - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(np.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Data2VecAudioForCTC(config=config) - - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = 
ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Data2VecAudioForSequenceClassification(config=config) - - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - #@unittest.skip('ignore train temporarily') - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForCTC(config=config) - - model.set_train(True) - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - def forward_fn(input_values, labels): - loss = model(input_values, labels=labels).loss - return loss - optimizer = optim.Adam(model.trainable_params(), lr=0.001) - grad_fn = value_and_grad(forward_fn, model.trainable_params()) - loss,grads = grad_fn(input_values,labels) - #loss.backward() - optimizer.step() - #loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForSequenceClassification(config=config) - - model.set_train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), 
len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - def forward_fn(input_values, labels): - loss = model(input_values, labels=labels).loss - return loss - optimizer = optim.Adam(model.trainable_params(), lr=0.001) - grad_fn = value_and_grad(forward_fn, model.trainable_params()) - loss = grad_fn(input_values,labels) - optimizer.step() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Data2VecAudioForXVector(config=config) - - model.set_train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - #loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Data2VecAudioForCTC(config) - - model.set_train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class Data2VecAudioModelTest(ModelTesterMixin, unittest.TestCase):#PipelineTesterMixin - all_model_classes = ( - ( - Data2VecAudioForCTC, - Data2VecAudioModel, - Data2VecAudioForSequenceClassification, - Data2VecAudioForAudioFrameClassification, - Data2VecAudioForXVector, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": Data2VecAudioForSequenceClassification, - "automatic-speech-recognition": Data2VecAudioForCTC, - "feature-extraction": Data2VecAudioModel, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = Data2VecAudioModelTester(self) - self.config_tester = ConfigTester(self, config_class=Data2VecAudioConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def 
test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - def test_model_get_set_embeddings(self): - pass - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Data2VecAudio has no inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="`input_ids` is renamed to `input_values`") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Data2VecAudio has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Data2VecAudio has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - # @is_pt_flax_cross_test - # # non-robust architecture does not exist in Flax - # def test_equivalence_flax_to_pt(self): - # pass - - # @is_pt_flax_cross_test - # # non-robust architecture does not exist in Flax - # def test_equivalence_pt_to_flax(self): - # pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "hf-internal-testing/tiny-random-data2vec-seq-class", mask_feature_prob=0.2, mask_feature_length=2 - ) - 
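The check_ctc_training, check_seq_classifier_training and check_xvector_training helpers exercised by the *_train tests above all follow MindSpore's functional training pattern: wrap the loss computation in a forward_fn, turn it into a joint (loss, grads) function with value_and_grad, and hand the gradients to the optimizer rather than calling loss.backward(). A minimal, self-contained sketch of that pattern with plain MindSpore APIs (nn.Dense and nn.MSELoss are illustrative stand-ins for the Data2VecAudio heads; the deleted tests go through mindnlp.core wrappers whose signatures may differ slightly):

    import numpy as np
    import mindspore
    from mindspore import nn, Tensor

    net = nn.Dense(4, 2)                      # stand-in for the model under test
    loss_fn = nn.MSELoss()
    optimizer = nn.Adam(net.trainable_params(), learning_rate=1e-3)

    def forward_fn(x, y):
        return loss_fn(net(x), y)

    # value_and_grad yields loss and gradients in one call, replacing the
    # loss.backward() step a PyTorch version of these tests would use.
    grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters)

    x = Tensor(np.random.randn(8, 4).astype(np.float32))
    y = Tensor(np.random.randn(8, 2).astype(np.float32))
    loss, grads = grad_fn(x, y)
    optimizer(grads)                          # apply the update (optimizer.step() analogue)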
model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = Data2VecAudioForCTC.from_pretrained( - "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2 - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 299, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base") - self.assertIsNotNone(model) - - -#@require_torch -class Data2VecAudioUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - num_masks = ops.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - 
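Under the coin-flip model described in the low-probability masking test above (an expected 0.05 * 100 / 10 = 0.5 spans per call, so probabilistic rounding masks either zero or one span with equal chance), the quoted 1.66e-18 failure bound is a binomial tail and can be reproduced directly:

    from math import comb

    # P(at most 9 "masked" calls out of 100 fair coin flips)
    p_fail = sum(comb(100, k) for k in range(10)) / 2**100
    print(f"{p_fail:.2e}")  # ~1.66e-18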
batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = ops.from_numpy(mask) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].astype(mindspore.bool_).asnumpy()].any()) - - -@require_mindspore -#@require_soundfile -@slow -class Data2VecAudioModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) - - return ds[:num_samples] - - def test_inference_ctc_normal(self): - model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") - - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_values = processor(input_speech, return_tensors="ms").input_values - - #with F.stop_gradient(): - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_batched(self): - model = Data2VecAudioForCTC.from_pretrained("facebook/data2vec-audio-base-960h") - processor = Wav2Vec2Processor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - #with F.stop_gradient(): - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around" - " him with thousands of spectators were trivialities not worth thinking about", - "his instant of panic was followed by a small sharp blow high on his chest", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git 
a/tests/transformers/models/data2vec/test_modeling_data2vec_text.py b/tests/transformers/models/data2vec/test_modeling_data2vec_text.py deleted file mode 100644 index dbf5a7c71..000000000 --- a/tests/transformers/models/data2vec/test_modeling_data2vec_text.py +++ /dev/null @@ -1,700 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch Data2VecAudio model.""" - -import unittest - -import numpy as np - -from mindnlp.transformers import Data2VecTextConfig -from mindnlp.utils.testing_utils import ( - TestCasePlus, - is_mindspore_available, - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, - random_attention_mask, -) - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextModel, - ) - from mindnlp.transformers.models.data2vec.modeling_data2vec_text import ( - Data2VecTextForTextEmbeddings, - create_position_ids_from_input_ids, - ) - - -class Data2VecTextModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = 
random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return Data2VecTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size] - ) - encoder_attention_mask = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2 - ) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = Data2VecTextModel(config=config) - model.set_train(False) - result = model( - input_ids, attention_mask=input_mask, token_type_ids=token_type_ids - ) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - self.parent.assertEqual( - result.pooler_output.shape, (self.batch_size, self.hidden_size) - ) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Data2VecTextModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model( - input_ids, attention_mask=input_mask, token_type_ids=token_type_ids - ) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - self.parent.assertEqual( - result.pooler_output.shape, (self.batch_size, self.hidden_size) - ) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, 
- choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Data2VecTextForCausalLM(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Data2VecTextForCausalLM(config=config) - model.set_train(False) - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue( - np.allclose( - output_from_past_slice.asnumpy(), - output_from_no_past_slice.asnumpy(), - atol=1e-3, - ) - ) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = Data2VecTextForMaskedLM(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = Data2VecTextForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - 
result.logits.shape, (self.batch_size, self.seq_length, self.num_labels) - ) - - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_choices = self.num_choices - model = Data2VecTextForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices) - ) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = Data2VecTextForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class Data2VecTextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Data2VecTextForCausalLM, - Data2VecTextForMaskedLM, - Data2VecTextModel, - Data2VecTextForSequenceClassification, - Data2VecTextForTokenClassification, - Data2VecTextForMultipleChoice, - Data2VecTextForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (Data2VecTextForCausalLM,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": Data2VecTextModel, - "fill-mask": Data2VecTextForMaskedLM, - "question-answering": Data2VecTextForQuestionAnswering, - "text-classification": Data2VecTextForSequenceClassification, - "text-generation": Data2VecTextForCausalLM, - "token-classification": Data2VecTextForTokenClassification, - "zero-shot": Data2VecTextForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - model_split_percents = [0.5, 0.9] - - def setUp(self): - self.model_tester = Data2VecTextModelTester(self) - self.config_tester = ConfigTester( - self, config_class=Data2VecTextConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = 
type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs - ) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs - ) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/data2vec-text-base" - model = Data2VecTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = Data2VecTextForTextEmbeddings(config=config) - - input_ids = mindspore.tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = mindspore.tensor( - [ - [ - 0 + model.padding_idx + 1, - 1 + model.padding_idx + 1, - 2 + model.padding_idx + 1, - model.padding_idx, - ] - ] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is Data2VecTextForTextEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = Data2VecTextForTextEmbeddings(config=config) - - inputs_embeds = mindspore.numpy.empty((2, 4, 30)) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = mindspore.tensor( - [expected_single_positions, expected_single_positions] - ) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - -@require_mindspore -class Data2VecTextModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base") - - input_ids = mindspore.tensor( - [[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]] - ) - output = model(input_ids)[0] - expected_shape = (1, 11, 50265) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [ - [ - [0.2328, 0.0000, 1.1710], - [2.2525, 0.0000, 1.9937], - [2.1280, 0.0000, 1.8691], - ] - ] - ) - - self.assertTrue( - np.allclose( - output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4 - ) - ) - - @slow - def test_inference_no_head(self): - model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base") - - input_ids = mindspore.tensor( - [[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]] - ) - - output = model(input_ids)[0] - # compare the actual values for a slice. 
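Both position-id regression tests above pin down the same contract: non-padding tokens receive consecutive positions starting at padding_idx + 1, while padded slots keep padding_idx itself. A small numpy sketch of one way to satisfy that contract (padding_idx=1 below is only an assumption for illustration; the real value comes from the model's embedding layer):

    import numpy as np

    def position_ids_from_input_ids(input_ids, padding_idx):
        mask = (input_ids != padding_idx).astype(np.int64)
        # consecutive positions for real tokens, shifted past the padding index;
        # padded slots fall back to padding_idx itself
        return np.cumsum(mask, axis=1) * mask + padding_idx

    input_ids = np.array([[12, 31, 13, 1]])                       # last token is padding
    print(position_ids_from_input_ids(input_ids, padding_idx=1))  # [[2 3 4 1]]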
- expected_slice = mindspore.tensor( - [ - [ - [0.1998, -0.0379, 0.0024], - [-0.0971, -0.2214, -0.1798], - [-0.0789, -0.2400, -0.1898], - ] - ] - ) - - self.assertTrue( - np.allclose( - output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4 - ) - ) diff --git a/tests/transformers/models/data2vec/test_modeling_data2vec_vision.py b/tests/transformers/models/data2vec/test_modeling_data2vec_vision.py deleted file mode 100644 index 2c7e9af48..000000000 --- a/tests/transformers/models/data2vec/test_modeling_data2vec_vision.py +++ /dev/null @@ -1,361 +0,0 @@ - -import unittest - -import unittest -import numpy as np -from mindnlp.transformers import Data2VecVisionConfig -from mindnlp.utils import cached_property -from mindnlp.utils.testing_utils import ( - TestCasePlus, - is_mindspore_available, - is_vision_available, - require_mindspore, - slow, require_vision, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - - - -if is_mindspore_available(): - import mindspore - - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - Data2VecVisionForImageClassification, - Data2VecVisionForSemanticSegmentation, - Data2VecVisionModel, - ) - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import BeitImageProcessor - - -class Data2VecVisionModelTester: - def __init__( - self, - parent, - vocab_size=100, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - num_labels=3, - scope=None, - out_indices=[0, 1, 2, 3], - ): - self.parent = parent - self.vocab_size = 100 - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.out_indices = out_indices - self.num_labels = num_labels - - # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - pixel_labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels, pixel_labels - - def get_config(self): - return Data2VecVisionConfig( - vocab_size=self.vocab_size, - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - 
hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - out_indices=self.out_indices, - ) - - def create_and_check_model(self, config, pixel_values, labels, pixel_labels): - model = Data2VecVisionModel(config=config) - - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - num_patches = (self.image_size // self.patch_size) ** 2 - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.type_sequence_label_size - model = Data2VecVisionForImageClassification(config) - - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def create_and_check_for_image_segmentation(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = Data2VecVisionForSemanticSegmentation(config) - - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) - ) - result = model(pixel_values, labels=pixel_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels, pixel_labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class Data2VecVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Data2VecVision does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - (Data2VecVisionModel, Data2VecVisionForImageClassification, Data2VecVisionForSemanticSegmentation) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": Data2VecVisionModel, - "image-classification": Data2VecVisionForImageClassification, - "image-segmentation": Data2VecVisionForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = Data2VecVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Data2VecVision does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - - @unittest.skip( - reason="Data2VecVision has some layers using `add_module` which doesn't work well with `nn.DataParallel`" - ) - def test_multi_gpu_data_parallel_forward(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_image_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs) - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - if model_class.__name__ in MODEL_MAPPING_NAMES.values(): - continue - - model = model_class(config) - - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - - - - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - # we skip lambda parameters as these require special initial values - # determined by config.layer_scale_init_value - if "lambda" in name: - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/data2vec-vision-base-ft1k" - model = Data2VecVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -import pathlib -import os -import inspect -def get_tests_dir(append_path=None): - """ - Args: - append_path: optional path to append to the tests dir path - - Return: - The full path to the 
`tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is - joined after the `tests` dir the former is provided. - - """ - # this function caller's __file__ - caller__file__ = inspect.stack()[1][1] - tests_dir = os.path.abspath(os.path.dirname(caller__file__)) - - while not tests_dir.endswith("tests"): - tests_dir = os.path.dirname(tests_dir) - - if append_path: - return os.path.join(tests_dir, append_path) - return tests_dir - - -# We will verify our results on an image of cute cats -def prepare_img(): - fixtures_path = pathlib.Path(get_tests_dir()) / 'fixtures/tests_samples/COCO' - image = Image.open(fixtures_path / "000000039769.png") - return image - - - -@require_vision -class Data2VecVisionModelIntegrationTest(unittest.TestCase): - - @cached_property - def default_image_processor(self): - return ( - BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") if is_vision_available() else None - ) - - @slow - def test_inference_image_classification_head_imagenet_1k(self): - model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor([0.3277, -0.1395, 0.0911]) - - self.assertTrue(ops.allclose(logits[0, :3], expected_slice, atol=1e-4)) - - expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]] - #print(type(logits[0].topk(2))) - self.assertEqual(logits[0].topk(2)[1].tolist(), expected_top2) - - @slow - def test_inference_interpolate_pos_encoding(self): - model_name = "facebook/data2vec-vision-base-ft1k" - model = Data2VecVisionModel.from_pretrained(model_name, **{"use_absolute_position_embeddings": True}) - - image = prepare_img() - processor = BeitImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k") - inputs = processor(images=image, return_tensors="ms", size={"height": 480, "width": 480}) - pixel_values = inputs.pixel_values - - # with interpolate_pos_encoding being False an exception should be raised with higher resolution - # images than what the model supports. - self.assertFalse(processor.do_center_crop) - - '''with self.assertRaises(ValueError, msg="doesn't match model"): - model(pixel_values, interpolate_pos_encoding=False)''' - - # with interpolate_pos_encoding being True the model should process the higher resolution image - # successfully and produce the expected output. - - outputs = model(pixel_values, interpolate_pos_encoding=True) - - expected_shape = (1, 1801, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) \ No newline at end of file diff --git a/tests/transformers/models/dbrx/__init__.py b/tests/transformers/models/dbrx/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/dbrx/test_modeling_dbrx.py b/tests/transformers/models/dbrx/test_modeling_dbrx.py deleted file mode 100644 index 8fa6b8aba..000000000 --- a/tests/transformers/models/dbrx/test_modeling_dbrx.py +++ /dev/null @@ -1,399 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch DBRX model.""" - -import unittest -import numpy as np -from mindnlp.transformers import DbrxConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindnlp.transformers import DbrxForCausalLM, DbrxModel - from mindnlp.core import ops - -class DbrxModelTester: - def __init__( - self, - parent, - hidden_size=32, - ffn_hidden_size=32, - num_attention_heads=4, - kv_n_heads=4, - num_hidden_layers=5, - max_position_embeddings=512, - type_vocab_size=16, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - use_cache=True, - type_sequence_label_size=2, - num_labels=3, - num_choices=4, - scope=None, - clip_qkv=8, - rope_theta=500000, - attn_config_model_type="", - emb_pdrop=0.0, - moe_jitter_eps=0, - moe_loss_weight=0.05, - moe_num_experts=16, - moe_top_k=4, - ffn_config_model_type="", - ffn_act_fn_name="gelu", - initializer_range=0.02, - output_router_logits=False, - resid_pdrop=0.0, - tie_word_embeddings=False, - torch_dtype="bfloat16", - vocab_size=99, - is_decoder=True, - pad_token_id=0, - ): - # Parameters unique to testing - self.batch_size = batch_size - self.seq_length = seq_length - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.parent = parent - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - - # attn_config params - self.clip_qkv = clip_qkv - self.kv_n_heads = kv_n_heads - self.rope_theta = rope_theta - self.attn_config_model_type = attn_config_model_type - - # ffn_config params - self.ffn_hidden_size = ffn_hidden_size - self.moe_jitter_eps = moe_jitter_eps - self.moe_loss_weight = moe_loss_weight - self.moe_num_experts = moe_num_experts - self.moe_top_k = moe_top_k - self.ffn_config_model_type = ffn_config_model_type - self.ffn_act_fn_name = ffn_act_fn_name - - # Other model params - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.max_position_embeddings = max_position_embeddings - self.vocab_size = vocab_size - self.use_cache = use_cache - self.initializer_range = initializer_range - self.emb_pdrop = emb_pdrop - self.output_router_logits = output_router_logits - self.resid_pdrop = resid_pdrop - self.tie_word_embeddings = tie_word_embeddings - self.torch_dtype = torch_dtype - self.is_decoder = is_decoder - self.pad_token_id = pad_token_id - - # Make the dictionaries - self.ffn_config = { - "ffn_hidden_size": self.ffn_hidden_size, - "moe_jitter_eps": self.moe_jitter_eps, - "moe_loss_weight": self.moe_loss_weight, - "moe_num_experts": 
self.moe_num_experts, - "moe_top_k": self.moe_top_k, - "model_type": self.ffn_config_model_type, - "ffn_act_fn": {"name": self.ffn_act_fn_name}, - } - self.attn_config = { - "clip_qkv": self.clip_qkv, - "kv_n_heads": self.kv_n_heads, - "model_type": self.attn_config_model_type, - "rope_theta": self.rope_theta, - } - - def prepare_config_and_inputs(self): - input_ids = ids_tensor((self.batch_size, self.seq_length), self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - # Behind the scenes, `DbrxConfig` maps the parameters `hidden_size`, `num_hidden_layers`, - # `num_attention_heads`, `max_position_embeddings` to the parameters `d_model`, `n_layers`, - # `n_heads`, `max_seq_len` respectively. We use the first group of parameters because - # other tests expect every model to have these parameters with these specific names. - config = DbrxConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, # mapped to `d_model` - num_hidden_layers=self.num_hidden_layers, # mapped to `n_layers` - num_attention_heads=self.num_attention_heads, # mapped to `n_heads` - max_position_embeddings=self.max_position_embeddings, # mapped to `max_seq_len` - attn_config=self.attn_config, - ffn_config=self.ffn_config, - resid_pdrop=self.resid_pdrop, - emb_pdrop=self.emb_pdrop, - use_cache=self.use_cache, - initializer_range=self.initializer_range, - output_router_logits=self.output_router_logits, - is_decoder=self.is_decoder, - pad_token_id=self.pad_token_id, - ) - return config - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Dbrx - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DbrxModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Dbrx - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = DbrxModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, 
self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Dbrx - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = DbrxForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = DbrxForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Dbrx - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class DbrxModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (DbrxModel, DbrxForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (DbrxForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = {"text-generation": DbrxForCausalLM} if is_mindspore_available() else {} - test_headmasking = False - test_pruning = False - - def setUp(self): - self.model_tester = DbrxModelTester(self) - self.config_tester = 
ConfigTester(self, config_class=DbrxConfig, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "eitanturok/dbrx-tiny" - model = DbrxModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip(reason="Dbrx models have weight tying disabled.") - def test_tied_weights_keys(self): - pass - - # Offload does not work with Dbrx models because of the forward of DbrxExperts where we chunk the experts. - # The issue is that the offloaded weights of the mlp layer are still on meta device (w1_chunked, v1_chunked, w2_chunked) - @unittest.skip(reason="Dbrx models do not work with offload") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Dbrx models do not work with offload") - def test_disk_offload_safetensors(self): - pass - - @unittest.skip(reason="Dbrx models do not work with offload") - def test_disk_offload_bin(self): - pass - - @unittest.skip(reason="Dbrx models do not work with offload") - def test_beam_sample_generate(self): - pass - - @unittest.skip(reason="Dbrx models do not work with offload") - def test_generate_from_inputs_embeds_decoder_only(self): - pass - - @unittest.skip(reason="Dbrx models do not work with offload") - def test_generate_compile_fullgraph(self): - pass - - -@require_mindspore -class DbrxModelIntegrationTest(unittest.TestCase): - @slow - def test_tiny_model_logits(self): - model = DbrxForCausalLM.from_pretrained("Rocketknight1/dbrx-tiny-random") - input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - vocab_size = model.vocab_size - - expected_shape = (1, 6, vocab_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [ - [-1.6300e-04, 5.0118e-04, 2.5437e-04], - [2.0422e-05, 2.7210e-04, -1.5125e-04], - [-1.5105e-04, 4.6879e-04, 3.3309e-04], - ] - ] - ) - diff --git a/tests/transformers/models/deberta/__init__.py b/tests/transformers/models/deberta/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/deberta/test_modeling_deberta.py b/tests/transformers/models/deberta/test_modeling_deberta.py deleted file mode 100644 index 89243b3ec..000000000 --- a/tests/transformers/models/deberta/test_modeling_deberta.py +++ /dev/null @@ -1,297 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
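# The DeBERTa file below follows the same ModelTester pattern as every other suite in this
# patch: a plain helper class builds a tiny config plus random inputs, each create_and_check_*
# method instantiates one model head in inference mode and asserts output shapes, and
# prepare_config_and_inputs_for_common repackages the inputs for the shared ModelTesterMixin
# checks. A minimal, dependency-free sketch of that pattern -- ToyConfig, ToyModelTester and the
# shapes used here are hypothetical stand-ins, not part of the real test suite:
import unittest


class ToyConfig:
    def __init__(self, vocab_size=99, hidden_size=32):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size


class ToyModelTester:
    def __init__(self, parent, batch_size=2, seq_length=5):
        self.parent = parent
        self.batch_size = batch_size
        self.seq_length = seq_length

    def get_config(self):
        return ToyConfig()

    def prepare_config_and_inputs(self):
        # the real testers call ids_tensor(...) for random token ids; constants keep the sketch small
        input_ids = [[1] * self.seq_length for _ in range(self.batch_size)]
        return self.get_config(), input_ids

    def create_and_check_model(self, config, input_ids):
        # stands in for building e.g. DebertaModel(config), switching to inference mode,
        # running a forward pass and asserting the hidden-state shape; here we only assert
        # the input layout the model would see
        self.parent.assertEqual(len(input_ids), self.batch_size)
        self.parent.assertEqual(len(input_ids[0]), self.seq_length)

    def prepare_config_and_inputs_for_common(self):
        config, input_ids = self.prepare_config_and_inputs()
        return config, {"input_ids": input_ids}


class ToyModelTest(unittest.TestCase):
    def test_model(self):
        tester = ToyModelTester(self)
        tester.create_and_check_model(*tester.prepare_config_and_inputs())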
-import unittest - -from mindnlp.transformers import DebertaConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - DebertaForMaskedLM, - DebertaForQuestionAnswering, - DebertaForSequenceClassification, - DebertaForTokenClassification, - DebertaModel, - ) - - -class DebertaModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - relative_attention=False, - position_biased_input=True, - pos_att_type="None", - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.relative_attention = relative_attention - self.position_biased_input = position_biased_input - self.pos_att_type = pos_att_type - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return DebertaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - 
max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - relative_attention=self.relative_attention, - position_biased_input=self.position_biased_input, - pos_att_type=self.pos_att_type, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def check_loss_output(self, result): - self.parent.assertListEqual(list(result.loss.shape), []) - - def create_and_check_deberta_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaModel(config=config) - model.eval() - sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] - sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] - sequence_output = model(input_ids)[0] - - self.parent.assertListEqual(list(sequence_output.shape), [self.batch_size, self.seq_length, self.hidden_size]) - - def create_and_check_deberta_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_deberta_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DebertaForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertListEqual(list(result.logits.shape), [self.batch_size, self.num_labels]) - self.check_loss_output(result) - - def create_and_check_deberta_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DebertaForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_deberta_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class DebertaModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DebertaModel, - DebertaForMaskedLM, - DebertaForSequenceClassification, - 
DebertaForTokenClassification, - DebertaForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": DebertaModel, - "fill-mask": DebertaForMaskedLM, - "question-answering": DebertaForQuestionAnswering, - "text-classification": DebertaForSequenceClassification, - "token-classification": DebertaForTokenClassification, - "zero-shot": DebertaForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - fx_compatible = True - test_torchscript = False - test_pruning = False - test_head_masking = False - is_encoder_decoder = False - - def setUp(self): - self.model_tester = DebertaModelTester(self) - self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_deberta_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/deberta-base" - model = DebertaModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class DebertaModelIntegrationTest(unittest.TestCase): - @unittest.skip(reason="Model not available yet") - def test_inference_masked_lm(self): - pass - - @slow - def test_inference_no_head(self): - model = DebertaModel.from_pretrained("microsoft/deberta-base") - - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [[[-0.5986, -0.8055, -0.8462], [1.4484, -0.9348, -0.8059], [0.3123, 0.0032, -1.4131]]] - ) - self.assertTrue(ops.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4), f"{output[:, 1:4, 1:4]}") \ No newline at end of file diff --git a/tests/transformers/models/deberta_v2/__init__.py b/tests/transformers/models/deberta_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/deberta_v2/test_modeling_deberta_v2.py b/tests/transformers/models/deberta_v2/test_modeling_deberta_v2.py deleted file mode 100644 index 7308e6e5b..000000000 --- a/tests/transformers/models/deberta_v2/test_modeling_deberta_v2.py +++ /dev/null @@ -1,314 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import unittest -import numpy as np - -from mindnlp.transformers import DebertaV2Config -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - DebertaV2ForMaskedLM, - DebertaV2ForMultipleChoice, - DebertaV2ForQuestionAnswering, - DebertaV2ForSequenceClassification, - DebertaV2ForTokenClassification, - DebertaV2Model, - - ) - - -class DebertaV2ModelTester(object): - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - relative_attention=False, - position_biased_input=True, - pos_att_type="None", - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.relative_attention = relative_attention - self.position_biased_input = position_biased_input - self.pos_att_type = pos_att_type - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def 
get_config(self): - return DebertaV2Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - relative_attention=self.relative_attention, - position_biased_input=self.position_biased_input, - pos_att_type=self.pos_att_type, - ) - - def check_loss_output(self, result): - self.parent.assertListEqual(list(result.loss.shape), []) - - def create_and_check_deberta_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaV2Model(config=config) - model.set_train(False) - sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] - sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] - sequence_output = model(input_ids)[0] - - self.parent.assertListEqual(list(sequence_output.shape), [self.batch_size, self.seq_length, self.hidden_size]) - - def create_and_check_deberta_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaV2ForMaskedLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_deberta_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DebertaV2ForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertListEqual(list(result.logits.shape), [self.batch_size, self.num_labels]) - self.check_loss_output(result) - - def create_and_check_deberta_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DebertaV2ForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_deberta_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaV2ForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_deberta_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaV2ForMultipleChoice(config=config) - 
model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DebertaV2ForMaskedLM, - DebertaV2ForSequenceClassification, - DebertaV2ForTokenClassification, - DebertaV2ForQuestionAnswering, - DebertaV2ForMultipleChoice, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": DebertaV2Model, - "fill-mask": DebertaV2ForMaskedLM, - "question-answering": DebertaV2ForQuestionAnswering, - "text-classification": DebertaV2ForSequenceClassification, - "token-classification": DebertaV2ForTokenClassification, - "zero-shot": DebertaV2ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - fx_compatible = True - test_torchscript = False - test_pruning = False - test_head_masking = False - is_encoder_decoder = False - - def setUp(self): - self.model_tester = DebertaV2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=DebertaV2Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_deberta_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_multiple_choice(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/deberta-v2-xlarge" - model = DebertaV2Model.from_pretrained(model_name,from_pt=True) - self.assertIsNotNone(model) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class DebertaV2ModelIntegrationTest(unittest.TestCase): - @unittest.skip(reason="Model not available yet") - def 
test_inference_masked_lm(self): - pass - - @slow - def test_inference_no_head(self): - model = DebertaV2Model.from_pretrained("microsoft/deberta-v2-xlarge") - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with mindspore._no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [[[0.2356, 0.1948, 0.0369], [-0.1063, 0.3586, -0.5152], [-0.6399, -0.0259, -0.2525]]] - ) - self.assertTrue(np.allclose(output[:, 1:4, 1:4].asnumpy(), expected_slice.asnumpy(), atol=2e-4), f"{output[:, 1:4, 1:4]}") - \ No newline at end of file diff --git a/tests/transformers/models/decision_transformer/__init__.py b/tests/transformers/models/decision_transformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/decision_transformer/test_modeling_decision_transformer.py b/tests/transformers/models/decision_transformer/test_modeling_decision_transformer.py deleted file mode 100644 index 4c3cab0db..000000000 --- a/tests/transformers/models/decision_transformer/test_modeling_decision_transformer.py +++ /dev/null @@ -1,247 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
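# The slow integration tests in this patch (test_inference_no_head for DeBERTa and DeBERTa-v2
# above, and the DecisionTransformer autoregressive test below) share one recipe: load a
# pretrained checkpoint, run a fixed input through it, and compare a small slice of the output
# against hard-coded reference values within a tolerance. A dependency-light sketch of just the
# comparison step, using numpy only -- the `output` array here is a made-up placeholder standing
# in for the model's [batch, seq_len, hidden] hidden states, not a real model result:
import numpy as np

output = np.zeros((1, 11, 8), dtype=np.float32)  # placeholder for model(input_ids, attention_mask=...)[0]
output[0, 1:4, 1:4] = [[0.2356, 0.1948, 0.0369],
                       [-0.1063, 0.3586, -0.5152],
                       [-0.6399, -0.0259, -0.2525]]

# reference slice copied from the DeBERTa-v2 test above; atol=2e-4 tolerates small numerical
# drift between backends and dtypes rather than demanding bitwise equality
expected_slice = np.array([[[0.2356, 0.1948, 0.0369],
                            [-0.1063, 0.3586, -0.5152],
                            [-0.6399, -0.0259, -0.2525]]], dtype=np.float32)

assert np.allclose(output[:, 1:4, 1:4], expected_slice, atol=2e-4), output[:, 1:4, 1:4]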
-"""Testing suite for the PyTorch DecisionTransformer model.""" - -import inspect -import unittest - -import numpy as np -from mindnlp.transformers import DecisionTransformerConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import DecisionTransformerModel - -class DecisionTransformerModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - act_dim=6, - state_dim=17, - hidden_size=23, - max_length=11, - is_training=True, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.act_dim = act_dim - self.state_dim = state_dim - self.hidden_size = hidden_size - self.max_length = max_length - self.is_training = is_training - - def prepare_config_and_inputs(self): - states = floats_tensor((self.batch_size, self.seq_length, self.state_dim)) - actions = floats_tensor((self.batch_size, self.seq_length, self.act_dim)) - rewards = floats_tensor((self.batch_size, self.seq_length, 1)) - returns_to_go = floats_tensor((self.batch_size, self.seq_length, 1)) - timesteps = ids_tensor((self.batch_size, self.seq_length), vocab_size=1000) - attention_mask = random_attention_mask((self.batch_size, self.seq_length)) - - config = self.get_config() - - return ( - config, - states, - actions, - rewards, - returns_to_go, - timesteps, - attention_mask, - ) - - def get_config(self): - return DecisionTransformerConfig( - batch_size=self.batch_size, - seq_length=self.seq_length, - act_dim=self.act_dim, - state_dim=self.state_dim, - hidden_size=self.hidden_size, - max_length=self.max_length, - ) - - def create_and_check_model( - self, - config, - states, - actions, - rewards, - returns_to_go, - timesteps, - attention_mask, - ): - model = DecisionTransformerModel(config=config) - model.set_train(False) - result = model(states, actions, rewards, returns_to_go, timesteps, attention_mask) - - self.parent.assertEqual(result.state_preds.shape, states.shape) - self.parent.assertEqual(result.action_preds.shape, actions.shape) - self.parent.assertEqual(result.return_preds.shape, returns_to_go.shape) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.seq_length * 3, self.hidden_size) - ) # seq length *3 as there are 3 modelities: states, returns and actions - - def prepare_config_and_inputs_for_common(self): - - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - states, - actions, - rewards, - returns_to_go, - timesteps, - attention_mask, - ) = config_and_inputs - inputs_dict = { - "states": states, - "actions": actions, - "rewards": rewards, - "returns_to_go": returns_to_go, - "timesteps": timesteps, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class DecisionTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = () if is_mindspore_available() else () - all_generative_model_classes = () - pipeline_model_mapping = {"feature-extraction": DecisionTransformerModel} if is_mindspore_available() else {} - - # Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids - test_generate_without_input_ids = False - - # Ignoring of a 
failing tests from ModelTesterMixin, as the model does not implement these features - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_attention_outputs = False - test_hidden_states_output = False - test_inputs_embeds = False - test_model_get_set_embeddings = False - test_gradient_checkpointing = False - test_torchscript = False - - def setUp(self): - self.model_tester = DecisionTransformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=DecisionTransformerConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "edbeeching/decision-transformer-gym-hopper-medium" - model = DecisionTransformerModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "states", - "actions", - "rewards", - "returns_to_go", - "timesteps", - "attention_mask", - ] - - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - -@require_mindspore -class DecisionTransformerModelIntegrationTest(unittest.TestCase): - @slow - def test_autoregressive_prediction(self): - """ - An integration test that performs autoregressive prediction of state, action and return - from a sequence of state, actions and returns. Test is performed over two timesteps. 
- - """ - - NUM_STEPS = 1 # number of steps of autoregressive prediction we will perform - TARGET_RETURN = 10 # defined by the RL environment, may be normalized - model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-expert",from_pt=True) - config = model.config - state = mindspore.tensor([[[1.5409961, -0.2934289, -2.1787894, 0.56843126, -1.0845224, - -1.3985955, 0.40334684, 0.83802634, -0.7192576, -0.40334353, - -0.59663534]]]) - expected_outputs = mindspore.tensor( - [[0.242793, -0.28693074, 0.8742613], [0.67815274, -0.08101085, -0.12952147]] - ) - - returns_to_go = mindspore.tensor(TARGET_RETURN, dtype=mindspore.float32).reshape((1, 1, 1)) - states = state - actions = ops.zeros((1, 0, config.act_dim), dtype=mindspore.float32) - rewards = ops.zeros(1, 0, dtype=mindspore.float32) - timesteps = mindspore.tensor(0, dtype=mindspore.int64).reshape((1, 1)) - - for step in range(NUM_STEPS): - actions = ops.cat([actions, ops.zeros((1, 1, config.act_dim))], axis=1) - rewards = ops.cat([rewards, ops.zeros((1, 1))], axis=1) - - attention_mask = ops.ones((1, states.shape[1]),dtype=mindspore.int64) - with mindspore._no_grad(): - _, action_pred, _ = model( - states=states, - actions=actions, - rewards=rewards, - returns_to_go=returns_to_go, - timesteps=timesteps, - attention_mask=attention_mask, - return_dict=False, - ) - - self.assertEqual(action_pred.shape, actions.shape) - self.assertTrue(np.allclose(action_pred[0, -1].asnumpy(), expected_outputs[step].asnumpy(), atol=1e-4)) - new_uniform=[[[0.9398099, 0.7748488, 0.19186942, 1.2637948, -1.2904351, - -0.7911027, -0.02087947, -0.71848005, 0.51863676, -1.3125219, - 0.19199507]]] - state, reward, _, _ = ( # env.step(action) - new_uniform, - 1.0, - False, - {}, - ) - actions[-1] = action_pred[0, -1] - states = ops.cat([states, state], axis=1) - pred_return = returns_to_go[0, -1] - reward - returns_to_go = ops.cat([returns_to_go, pred_return.reshape((1, 1, 1))], axis=1) - timesteps = ops.cat( - [timesteps, ops.ones((1, 1), dtype=mindspore.int64) * (step + 1)], axis=1 - ) diff --git a/tests/transformers/models/deepseek_v2/__init__.py b/tests/transformers/models/deepseek_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/deepseek_v2/testing_deepseek_v2.py b/tests/transformers/models/deepseek_v2/testing_deepseek_v2.py deleted file mode 100644 index fcae899aa..000000000 --- a/tests/transformers/models/deepseek_v2/testing_deepseek_v2.py +++ /dev/null @@ -1,9 +0,0 @@ -import mindspore -from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM - -tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base") -model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", ms_dtype=mindspore.float32) -input_text = "#write a quick sort algorithm" -inputs = tokenizer(input_text, return_tensors="ms") -outputs = model.generate(**inputs, max_length=128) -print(tokenizer.decode(outputs[0], skip_special_tokens=True)) diff --git a/tests/transformers/models/deformable_detr/__init__.py b/tests/transformers/models/deformable_detr/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/transformers/models/deformable_detr/test_image_processing_deformable_detr.py deleted file mode 100644 index cf922db5d..000000000 --- a/tests/transformers/models/deformable_detr/test_image_processing_deformable_detr.py +++ /dev/null @@ 
-1,756 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import pathlib -import unittest - -import numpy as np - -from mindnlp.core import ops -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow - -from ...test_image_processing_common import ( - AnnotationFormatTestMixin, - ImageProcessingTestMixin, - prepare_image_inputs, -) - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - from mindnlp.transformers import DeformableDetrImageProcessor - - -class DeformableDetrImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_rescale=True, - rescale_factor=1 / 255, - do_pad=True, - ): - # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p - size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_pad = do_pad - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_pad": self.do_pad, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to DeformableDetrImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - elif isinstance(image, np.ndarray): - h, w = image.shape[0], image.shape[1] - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs( - self, equal_resolution=False, numpify=False, torchify=False - ): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class DeformableDetrImageProcessingTest( - AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase -): - image_processing_class = ( - DeformableDetrImageProcessor if is_vision_available() else None - ) - - def setUp(self): - super().setUp() - self.image_processor_tester = DeformableDetrImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "do_pad")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict - ) - self.assertEqual( - image_processor.size, {"shortest_edge": 18, "longest_edge": 1333} - ) - self.assertEqual(image_processor.do_pad, True) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, - size=42, - max_size=84, - pad_and_return_pixel_mask=False, - ) - self.assertEqual( - image_processor.size, {"shortest_edge": 42, "longest_edge": 84} - ) - self.assertEqual(image_processor.do_pad, False) - - @slow - def test_call_pytorch_with_coco_detection_annotations(self): - # prepare image and target - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"image_id": 39769, "annotations": target} - - # encode them - image_processing = DeformableDetrImageProcessor() - encoding = image_processing( - images=image, annotations=target, 
return_tensors="ms" - ) - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue( - np.allclose( - encoding["pixel_values"][0, 0, 0, :3].numpy(), - expected_slice.numpy(), - atol=1e-4, - ) - ) - - # verify area - expected_area = mindspore.tensor( - [5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438] - ) - self.assertTrue( - np.allclose(encoding["labels"][0]["area"].numpy(), expected_area.numpy()) - ) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"][0].numpy(), - expected_boxes_slice.numpy(), - atol=1e-3, - ) - ) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["image_id"].numpy(), expected_image_id.numpy() - ) - ) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["iscrowd"].numpy(), expected_is_crowd.numpy() - ) - ) - # verify class_labels - expected_class_labels = mindspore.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["class_labels"].numpy(), - expected_class_labels.numpy(), - ) - ) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["orig_size"].numpy(), expected_orig_size.numpy() - ) - ) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue( - np.allclose(encoding["labels"][0]["size"].numpy(), expected_size.numpy()) - ) - - @slow - def test_call_pytorch_with_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - with open( - "./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r" - ) as f: - target = json.loads(f.read()) - - target = { - "file_name": "000000039769.png", - "image_id": 39769, - "segments_info": target, - } - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - # encode them - image_processing = DeformableDetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=image, annotations=target, masks_path=masks_path, return_tensors="ms" - ) - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue( - np.allclose( - encoding["pixel_values"][0, 0, 0, :3].numpy(), - expected_slice.numpy(), - atol=1e-4, - ) - ) - - # verify area - expected_area = mindspore.tensor( - [147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147] - ) - self.assertTrue( - np.allclose(encoding["labels"][0]["area"].numpy(), expected_area.numpy()) - ) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"][0].numpy(), - expected_boxes_slice.numpy(), - atol=1e-3, - ) - ) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue( - 
np.allclose( - encoding["labels"][0]["image_id"].numpy(), expected_image_id.numpy() - ) - ) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["iscrowd"].numpy(), expected_is_crowd.numpy() - ) - ) - # verify class_labels - expected_class_labels = mindspore.tensor([17, 17, 63, 75, 75, 93]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["class_labels"].numpy(), - expected_class_labels.numpy(), - ) - ) - # verify masks - expected_masks_sum = 822873 - self.assertEqual( - encoding["labels"][0]["masks"].sum().item(), expected_masks_sum - ) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["orig_size"].numpy(), expected_orig_size.numpy() - ) - ) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue( - np.allclose(encoding["labels"][0]["size"].numpy(), expected_size.numpy()) - ) - - @slow - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->DeformableDetr - def test_batched_coco_detection_annotations(self): - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open( - "./tests/fixtures/tests_samples/COCO/000000039769.png" - ).resize((800, 800)) - - with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotations_0 = {"image_id": 39769, "annotations": target} - annotations_1 = {"image_id": 39769, "annotations": target} - - # Adjust the bounding boxes for the resized image - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotations_1["annotations"])): - coords = annotations_1["annotations"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotations_1["annotations"][i]["bbox"] = new_bbox - - images = [image_0, image_1] - annotations = [annotations_0, annotations_1] - - image_processing = DeformableDetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="ms", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].numpy(), - expected_boxes_0.numpy(), - rtol=1e-3, - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].numpy(), - expected_boxes_1.numpy(), - rtol=1e-3, - ) - ) - - # Check the masks have 
also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1 - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1 - ) - ) - - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->DeformableDetr - def test_batched_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open( - "./tests/fixtures/tests_samples/COCO/000000039769.png" - ).resize((800, 800)) - - with open( - "./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r" - ) as f: - target = json.loads(f.read()) - - annotation_0 = { - "file_name": "000000039769.png", - "image_id": 39769, - "segments_info": target, - } - annotation_1 = { - "file_name": "000000039769.png", - "image_id": 39769, - "segments_info": target, - } - - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotation_1["segments_info"])): - coords = annotation_1["segments_info"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotation_1["segments_info"][i]["bbox"] = new_bbox - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - images = [image_0, image_1] - annotations = [annotation_0, annotation_1] - - # encode them - image_processing = DeformableDetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="ms", - 
return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].numpy(), - expected_boxes_0.numpy(), - rtol=1e-3, - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].numpy(), - expected_boxes_1.numpy(), - rtol=1e-3, - ) - ) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1 - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1 - ) - ) - - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr - def 
test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = ops.ones(200, 100, 3, dtype=mindspore.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 100, 50)) - - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 100, "max_width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100}, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 100, 100)) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 300, "max_width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 301, 101)) - - ### Check for batch - image_2 = ops.ones(100, 150, 3, dtype=mindspore.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = DeformableDetrImageProcessor( - size={"max_height": 150, "max_width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 150, 100)) - - def test_longest_edge_shortest_edge_resizing_strategy(self): - image_1 = ops.ones(958, 653, 3, dtype=mindspore.uint8) - - # max size is set; width < height; - # do_pad=False, longest_edge=640, shortest_edge=640, image=958x653 -> 640x436 - image_processor = DeformableDetrImageProcessor( - size={"longest_edge": 640, "shortest_edge": 640}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 640, 436)) - - image_2 = ops.ones(653, 958, 3, dtype=mindspore.uint8) - # max size is set; height < width; - # do_pad=False, longest_edge=640, shortest_edge=640, image=653x958 -> 436x640 - image_processor = DeformableDetrImageProcessor( - size={"longest_edge": 640, "shortest_edge": 640}, - do_pad=False, - ) - inputs = image_processor(images=[image_2], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 436, 640)) - - image_3 = ops.ones(100, 120, 3, dtype=mindspore.uint8) - # max size is set; width == size; height > max_size; - # do_pad=False, longest_edge=118, shortest_edge=100, image=120x100 -> 118x98 - image_processor = DeformableDetrImageProcessor( - size={"longest_edge": 118, "shortest_edge": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_3], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 98, 118)) - - image_4 = ops.ones(128, 50, 3, dtype=mindspore.uint8) - # max size is set; height == size; width < max_size; - # do_pad=False, longest_edge=256, shortest_edge=50, image=50x128 -> 50x128 - image_processor = DeformableDetrImageProcessor( - size={"longest_edge": 256, "shortest_edge": 
50}, - do_pad=False, - ) - inputs = image_processor(images=[image_4], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 128, 50)) - - image_5 = ops.ones(50, 50, 3, dtype=mindspore.uint8) - # max size is set; height == width; width < max_size; - # do_pad=False, longest_edge=117, shortest_edge=50, image=50x50 -> 50x50 - image_processor = DeformableDetrImageProcessor( - size={"longest_edge": 117, "shortest_edge": 50}, - do_pad=False, - ) - inputs = image_processor(images=[image_5], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 50, 50)) diff --git a/tests/transformers/models/deformable_detr/test_modeling_deformable_detr.py b/tests/transformers/models/deformable_detr/test_modeling_deformable_detr.py deleted file mode 100644 index 536f7d9ed..000000000 --- a/tests/transformers/models/deformable_detr/test_modeling_deformable_detr.py +++ /dev/null @@ -1,890 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Deformable DETR model.""" - -import inspect -import math -import unittest -from typing import Dict, List, Tuple - -import numpy as np - -from mindnlp.core import ops -from mindnlp.transformers import DeformableDetrConfig, ResNetConfig -from mindnlp.utils import is_mindspore_available, is_vision_available - -from mindnlp.utils import cached_property -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - DeformableDetrForObjectDetection, - DeformableDetrModel, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class DeformableDetrModelTester: - def __init__( - self, - parent, - batch_size=8, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - num_queries=12, - num_channels=3, - image_size=196, - n_targets=8, - num_labels=91, - num_feature_levels=4, - encoder_n_points=2, - decoder_n_points=6, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_queries = num_queries - self.num_channels = num_channels - self.image_size = image_size - self.n_targets = n_targets - 
self.num_labels = num_labels - self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - - # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = ( - math.ceil(self.image_size / 8) ** 2 - + math.ceil(self.image_size / 16) ** 2 - + math.ceil(self.image_size / 32) ** 2 - + math.ceil(self.image_size / 64) ** 2 - ) - self.decoder_seq_length = self.num_queries - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_channels, self.image_size, self.image_size] - ) - - pixel_mask = ops.ones(self.batch_size, self.image_size, self.image_size) - - labels = None - if self.use_labels: - # labels is a list of Dict (each Dict being the labels for a given example in the batch) - labels = [] - for i in range(self.batch_size): - target = {} - target["class_labels"] = ops.randint( - low=0, high=self.num_labels, size=(self.n_targets,) - ) - target["boxes"] = ops.rand(self.n_targets, 4) - target["masks"] = ops.rand( - self.n_targets, self.image_size, self.image_size - ) - labels.append(target) - - config = self.get_config() - return config, pixel_values, pixel_mask, labels - - def get_config(self): - resnet_config = ResNetConfig( - num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - ) - return DeformableDetrConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - num_queries=self.num_queries, - num_labels=self.num_labels, - num_feature_levels=self.num_feature_levels, - encoder_n_points=self.encoder_n_points, - decoder_n_points=self.decoder_n_points, - use_timm_backbone=False, - backbone=None, - backbone_config=resnet_config, - use_pretrained_backbone=False, - ) - - def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} - return config, inputs_dict - - def create_and_check_deformable_detr_model( - self, config, pixel_values, pixel_mask, labels - ): - model = DeformableDetrModel(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.num_queries, self.hidden_size), - ) - - def create_and_check_deformable_detr_object_detection_head_model( - self, config, pixel_values, pixel_mask, labels - ): - model = DeformableDetrForObjectDetection(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_queries, self.num_labels) - ) - self.parent.assertEqual( - result.pred_boxes.shape, (self.batch_size, self.num_queries, 4) - ) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual( - result.logits.shape, 
(self.batch_size, self.num_queries, self.num_labels) - ) - self.parent.assertEqual( - result.pred_boxes.shape, (self.batch_size, self.num_queries, 4) - ) - - -@require_mindspore -class DeformableDetrModelTest( - ModelTesterMixin, GenerationTesterMixin, unittest.TestCase -): - all_model_classes = ( - (DeformableDetrModel, DeformableDetrForObjectDetection) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": DeformableDetrModel, - "object-detection": DeformableDetrForObjectDetection, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_torchscript = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - - # special case for head models - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class( - inputs_dict, model_class, return_labels=return_labels - ) - - if return_labels: - if model_class.__name__ == "DeformableDetrForObjectDetection": - labels = [] - for i in range(self.model_tester.batch_size): - target = {} - target["class_labels"] = ops.ones( - self.model_tester.n_targets, - dtype=mindspore.int64, - ) - target["boxes"] = ops.ones( - self.model_tester.n_targets, - 4, - dtype=mindspore.float32, - ) - target["masks"] = ops.ones( - self.model_tester.n_targets, - self.model_tester.image_size, - self.model_tester.image_size, - dtype=mindspore.float32, - ) - labels.append(target) - inputs_dict["labels"] = labels - - return inputs_dict - - def setUp(self): - self.model_tester = DeformableDetrModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=DeformableDetrConfig, - has_text_modality=False, - common_properties=[ - "num_channels", - "d_model", - "encoder_attention_heads", - "decoder_attention_heads", - ], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_deformable_detr_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deformable_detr_model(*config_and_inputs) - - def test_deformable_detr_object_detection_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deformable_detr_object_detection_head_model( - *config_and_inputs - ) - - @unittest.skip(reason="Deformable DETR does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Deformable DETR does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="Deformable DETR does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Deformable DETR is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="Deformable DETR does not use token embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions 
- self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_feature_levels, - self.model_tester.encoder_n_points, - ], - ) - out_len = len(outputs) - - correct_outlen = 8 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "DeformableDetrForObjectDetection": - correct_outlen += 2 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual( - len(decoder_attentions), self.model_tester.num_hidden_layers - ) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_queries, - self.model_tester.num_queries, - ], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_feature_levels, - self.model_tester.decoder_n_points, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_feature_levels, - self.model_tester.encoder_n_points, - ], - ) - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model( - **dict_inputs, return_dict=True, **additional_kwargs - ).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object, dict_object - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - 
recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - np.allclose( - set_nan_tensor_to_zero(tuple_object), - set_nan_tensor_to_zero(dict_object), - atol=1e-5, - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." - ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - print("Model class:", model_class) - model = model_class(config) - - model.set_train(False) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - dict_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True} - ) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_attentions": True} - ) - - tuple_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - dict_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True} - ) - - tuple_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - dict_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_attentions": True} - ) - - tuple_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - dict_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - check_equivalence( - model, - tuple_inputs, - dict_inputs, - {"output_hidden_states": True, "output_attentions": True}, - ) - - def test_retain_grad_hidden_states_attentions(self): - # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - # we take the second output since last_hidden_state is the second item - output = outputs[1] - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] - # encoder_hidden_states.retain_grad() - # encoder_attentions.retain_grad() - - decoder_attentions = outputs.decoder_attentions[0] - # decoder_attentions.retain_grad() - - cross_attentions = outputs.cross_attentions[0] - # cross_attentions.retain_grad() - - # output.flatten()[0].backward(retain_graph=True) - - # 
self.assertIsNotNone(encoder_hidden_states.grad) - # self.assertIsNotNone(encoder_attentions.grad) - # self.assertIsNotNone(decoder_attentions.grad) - # self.assertIsNotNone(cross_attentions.grad) - - def test_forward_auxiliary_loss(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.auxiliary_loss = True - - # only test for object detection and segmentation model - for model_class in self.all_model_classes[1:]: - model = model_class(config) - - inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - - outputs = model(**inputs) - - self.assertIsNotNone(outputs.auxiliary_outputs) - self.assertEqual( - len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1 - ) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = ["pixel_values", "pixel_mask"] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" in arg_names - else [] - ) - self.assertListEqual( - arg_names[: len(expected_arg_names)], expected_arg_names - ) - else: - expected_arg_names = ["pixel_values", "pixel_mask"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - @unittest.skip("MindNLP does not depend on timm") - def test_different_timm_backbone(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # let's pick a random timm backbone - config.backbone = "tf_mobilenetv3_small_075.in1k" - config.backbone_config = None - config.use_timm_backbone = True - config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} - - for model_class in self.all_model_classes: - model = model_class(config) - - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if model_class.__name__ == "DeformableDetrForObjectDetection": - expected_shape = ( - self.model_tester.batch_size, - self.model_tester.num_queries, - self.model_tester.num_labels, - ) - self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone - self.assertEqual( - len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4 - ) - else: - # Confirm out_indices was propogated to backbone - self.assertEqual( - len(model.backbone.conv_encoder.intermediate_channel_sizes), 4 - ) - - self.assertTrue(outputs) - - def test_hf_backbone(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Load a pretrained HF checkpoint as backbone - config.backbone = "microsoft/resnet-18" - config.backbone_config = None - config.use_timm_backbone = False - config.use_pretrained_backbone = True - config.backbone_kwargs = {"out_indices": [1, 2, 3, 4]} - - for model_class in self.all_model_classes: - model = model_class(config) - - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if model_class.__name__ == "DeformableDetrForObjectDetection": - expected_shape = ( - self.model_tester.batch_size, - self.model_tester.num_queries, - self.model_tester.num_labels, - ) - self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone - 
self.assertEqual( - len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 4 - ) - else: - # Confirm out_indices was propogated to backbone - self.assertEqual( - len(model.backbone.conv_encoder.intermediate_channel_sizes), 4 - ) - - self.assertTrue(outputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - print("Model class:", model_class) - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if param.requires_grad: - if ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - ): - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - def test_two_stage_training(self): - model_class = DeformableDetrForObjectDetection - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - config.two_stage = True - config.auxiliary_loss = True - config.with_box_refine = True - - model = model_class(config) - - model.set_train(True) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - def create_and_check_model_fp16_forward(self): - model_class = DeformableDetrForObjectDetection - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - model = model_class(config) - model.half() - model.set_train(False) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - output = model(**inputs)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_model_bf16_forward(self): - model_class = DeformableDetrForObjectDetection - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - model = model_class(config, torch_dtype=mindspore.bfloat16) - model.set_train(False) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - output = model(**inputs)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - -TOLERANCE = 1e-4 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_vision -@slow -class DeformableDetrModelIntegrationTests(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - AutoImageProcessor.from_pretrained( - "SenseTime/deformable-detr", from_pt=True - ) - if is_vision_available() - else None - ) - - def test_inference_object_detection_head(self): - model = DeformableDetrForObjectDetection.from_pretrained( - "SenseTime/deformable-detr", from_pt=True - ) - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - pixel_values 
= encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] - - outputs = model(pixel_values, pixel_mask) - - expected_shape_logits = (1, model.config.num_queries, model.config.num_labels) - - self.assertEqual(outputs.logits.shape, expected_shape_logits) - - expected_logits = mindspore.tensor( - [ - [-9.6645, -4.3449, -5.8705], - [-9.7035, -3.8504, -5.0724], - [-10.5634, -5.3379, -7.5116], - ] - ) - expected_boxes = mindspore.tensor( - [ - [0.8693, 0.2289, 0.2492], - [0.3150, 0.5489, 0.5845], - [0.5563, 0.7580, 0.8518], - ] - ) - - self.assertTrue( - np.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) - ) - - expected_shape_boxes = (1, model.config.num_queries, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue( - np.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) - ) - - # verify postprocessing - results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - expected_scores = mindspore.tensor([0.7999, 0.7894, 0.6331, 0.4720, 0.4382]) - expected_labels = [17, 17, 75, 75, 63] - expected_slice_boxes = mindspore.tensor([16.5028, 52.8390, 318.2544, 470.7841]) - self.assertEqual(len(results["scores"]), 5) - self.assertTrue(np.allclose(results["scores"], expected_scores, atol=1e-4)) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) - self.assertTrue(np.allclose(results["boxes"][0, :], expected_slice_boxes)) - - def test_inference_object_detection_head_with_box_refine_two_stage(self): - model = DeformableDetrForObjectDetection.from_pretrained( - "SenseTime/deformable-detr-with-box-refine-two-stage" - ) - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] - - outputs = model(pixel_values, pixel_mask) - - expected_shape_logits = (1, model.config.num_queries, model.config.num_labels) - - self.assertEqual(outputs.logits.shape, expected_shape_logits) - - expected_logits = mindspore.tensor( - [ - [-6.7108, -4.3213, -6.3777], - [-8.9014, -6.1799, -6.7240], - [-6.9315, -4.4735, -6.2298], - ] - ) - expected_boxes = mindspore.tensor( - [ - [0.2583, 0.5499, 0.4683], - [0.7652, 0.9068, 0.4882], - [0.5490, 0.2763, 0.0564], - ] - ) - - self.assertTrue( - np.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4) - ) - - expected_shape_boxes = (1, model.config.num_queries, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue( - np.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4) - ) - - def test_inference_object_detection_head_equivalence_cpu_gpu(self): - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] - - # 1. run model on CPU - model = DeformableDetrForObjectDetection.from_pretrained( - "SenseTime/deformable-detr-single-scale" - ) - - cpu_outputs = model(pixel_values, pixel_mask) - - # 2. run model on GPU - - gpu_outputs = model(pixel_values, pixel_mask) - - # 3. 
assert equivalence - for key in cpu_outputs.keys(): - assert np.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-4) - - expected_logits = mindspore.tensor( - [ - [-9.9051, -4.2541, -6.4852], - [-9.6947, -4.0854, -6.8033], - [-10.0665, -5.8470, -7.7003], - ] - ) - assert np.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-4) diff --git a/tests/transformers/models/deit/__init__.py b/tests/transformers/models/deit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/deit/test_image_processing_deit.py b/tests/transformers/models/deit/test_image_processing_deit.py deleted file mode 100644 index a1409271b..000000000 --- a/tests/transformers/models/deit/test_image_processing_deit.py +++ /dev/null @@ -1,118 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import DeiTImageProcessor - - -class DeiTImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"height": 20, "width": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class DeiTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = DeiTImageProcessor if is_vision_available() else None - 
test_cast_dtype = True - - def setUp(self): - self.image_processor_tester = DeiTImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 20, "width": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) diff --git a/tests/transformers/models/deit/test_modeling_deit.py b/tests/transformers/models/deit/test_modeling_deit.py deleted file mode 100644 index df6f92709..000000000 --- a/tests/transformers/models/deit/test_modeling_deit.py +++ /dev/null @@ -1,426 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the Mindspore DeiT model.""" - -import unittest -import warnings -import numpy as np - -from mindnlp.transformers import DeiTConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, - is_vision_available, - is_mindspore_available -) -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - DeiTForImageClassification, - DeiTForImageClassificationWithTeacher, - DeiTForMaskedImageModeling, - DeiTModel, - ) - from mindnlp.transformers.models.auto.modeling_auto import ( - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_MAPPING_NAMES, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import DeiTImageProcessor - - -class DeiTModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - num_labels=3, - scope=None, - encoder_stride=2, - mask_ratio=0.5, - attn_implementation="eager", - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.encoder_stride = encoder_stride - self.attn_implementation = attn_implementation - - # in DeiT, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distilation tokens) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 2 - self.mask_ratio = mask_ratio - self.num_masks = int(mask_ratio * self.seq_length) - self.mask_length = num_patches - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return DeiTConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, - attn_implementation=self.attn_implementation, - ) - - def 
create_and_check_model(self, config, pixel_values, labels): - model = DeiTModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): - model = DeiTForMaskedImageModeling(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) - ) - - # test greyscale images - config.num_channels = 1 - model = DeiTForMaskedImageModeling(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = DeiTForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = DeiTForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class DeiTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as DeiT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - DeiTForImageClassification, - DeiTForImageClassificationWithTeacher, - DeiTForMaskedImageModeling, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": DeiTModel, - "image-classification": (DeiTForImageClassification, DeiTForImageClassificationWithTeacher), - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = DeiTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DeiTConfig, has_text_modality=False, hidden_size=37) - - @unittest.skip( - "Since `torch==2.3+cu121`, although this test passes, many subsequent tests have `CUDA error: misaligned address`." - "If `nvidia-xxx-cu118` are also installed, no failure (even with `torch==2.3+cu121`)." 
- ) - def test_multi_gpu_data_parallel_forward(self): - super().test_multi_gpu_data_parallel_forward() - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="DeiT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="DeiT does not use input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_image_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - # special case for DeiTForImageClassificationWithTeacher model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "DeiTForImageClassificationWithTeacher": - del inputs_dict["labels"] - - return inputs_dict - - def test_training(self): - if not self.model_tester.is_training: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - # DeiTForImageClassificationWithTeacher supports inference-only - if ( - model_class.__name__ in MODEL_MAPPING_NAMES.values() - or model_class.__name__ == "DeiTForImageClassificationWithTeacher" - ): - continue - model = model_class(config) - model.set_train(True) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_problem_types(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - problem_types = [ - {"title": "multi_label_classification", "num_labels": 2, "dtype": ms.float32}, - {"title": "single_label_classification", "num_labels": 1, "dtype": ms.int64}, - {"title": "regression", "num_labels": 1, "dtype": ms.float32}, - ] - - for model_class in self.all_model_classes: - if ( - model_class.__name__ - not in [ - *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values(), - *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(), - ] - or model_class.__name__ == "DeiTForImageClassificationWithTeacher" - ): - continue - - for problem_type in problem_types: - with self.subTest(msg=f"Testing {model_class} with 
{problem_type['title']}"): - config.problem_type = problem_type["title"] - config.num_labels = problem_type["num_labels"] - - model = model_class(config) - model.set_train(True) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - if problem_type["num_labels"] > 1: - inputs["labels"] = inputs["labels"].unsqueeze(1).repeat(1, problem_type["num_labels"]) - - inputs["labels"] = inputs["labels"].astype(problem_type["dtype"]) - - # This tests that we do not trigger the warning form PyTorch "Using a target size that is different - # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure - # they have the same size." which is a symptom something in wrong for the regression problem. - # See https://github.com/huggingface/transformers/issues/11780 - with warnings.catch_warnings(record=True) as warning_list: - loss = model(**inputs).loss - for w in warning_list: - if "Using a target size that is different to the input size" in str(w.message): - raise ValueError( - f"Something is going wrong in the regression problem: intercepted {w.message}" - ) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/deit-base-distilled-patch16-224" - model = DeiTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class DeiTModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - DeiTImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224") - if is_vision_available() - else None - ) - - @slow - def test_inference_image_classification_head(self): - model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = ms.tensor([-1.0266, 0.1912, -1.2861]) - - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @slow - def test_inference_interpolate_pos_encoding(self): - model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224") - - image_processor = self.default_image_processor - - # image size is {"height": 480, "width": 640} - image = prepare_img() - image_processor.size = {"height": 480, "width": 640} - # center crop set to False so image is not center cropped to 224x224 - inputs = image_processor(images=image, return_tensors="ms", do_center_crop=False) - - # forward pass - outputs = model(**inputs, interpolate_pos_encoding=True) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - @slow - def test_inference_fp16(self): - r""" - A small test to make sure that inference work in half precision without any problem. 
- """ - model = DeiTModel.from_pretrained( - "facebook/deit-base-distilled-patch16-224", ms_dtype=ms.float16, from_pt=True - ) - image_processor = self.default_image_processor - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - pixel_values = inputs.pixel_values - - # forward pass to make sure inference works in fp16 - _ = model(pixel_values) diff --git a/tests/transformers/models/depth_anything/__init__.py b/tests/transformers/models/depth_anything/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/depth_anything/test_modeling_depth_anything.py b/tests/transformers/models/depth_anything/test_modeling_depth_anything.py deleted file mode 100644 index 8d9580af2..000000000 --- a/tests/transformers/models/depth_anything/test_modeling_depth_anything.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the mindspore Depth Anything model.""" - -import unittest - -import numpy as np - -from mindnlp.transformers import DepthAnythingConfig, Dinov2Config -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - -if is_mindspore_available(): - import mindspore - from mindnlp.transformers import DepthAnythingForDepthEstimation - -if is_vision_available(): - from PIL import Image - from mindnlp.transformers import DPTImageProcessor - - -class DepthAnythingModelTester: - # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.__init__ - def __init__( - self, - parent, - batch_size=2, - num_channels=3, - image_size=32, - patch_size=16, - use_labels=True, - num_labels=3, - is_training=True, - hidden_size=4, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=8, - out_features=["stage1", "stage2"], - apply_layernorm=False, - reshape_hidden_states=False, - neck_hidden_sizes=[2, 2], - fusion_hidden_size=6, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.out_features = out_features - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states - self.use_labels = use_labels - self.num_labels = num_labels - self.is_training = is_training - self.neck_hidden_sizes = neck_hidden_sizes - self.fusion_hidden_size = fusion_hidden_size - # DPT's sequence length - self.seq_length = (self.image_size // self.patch_size) ** 2 + 1 - - # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.prepare_config_and_inputs - def 
prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return DepthAnythingConfig( - backbone_config=self.get_backbone_config(), - reassemble_hidden_size=self.hidden_size, - patch_size=self.patch_size, - neck_hidden_sizes=self.neck_hidden_sizes, - fusion_hidden_size=self.fusion_hidden_size, - ) - - # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.get_backbone_config - def get_backbone_config(self): - return Dinov2Config( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - is_training=self.is_training, - out_features=self.out_features, - reshape_hidden_states=self.reshape_hidden_states, - ) - - # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.create_and_check_for_depth_estimation with DPT->DepthAnything - def create_and_check_for_depth_estimation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = DepthAnythingForDepthEstimation(config) - # model - model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) - - # Copied from tests.models.dpt.test_modeling_dpt_auto_backbone.DPTModelTester.prepare_config_and_inputs_for_common - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class DepthAnythingModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Depth Anything does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (DepthAnythingForDepthEstimation,) if is_mindspore_available() else () - pipeline_model_mapping = {"depth-estimation": DepthAnythingForDepthEstimation} if is_mindspore_available() else {} - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = DepthAnythingModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=DepthAnythingConfig, - has_text_modality=False, - hidden_size=37, - common_properties=["patch_size"], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings") - def test_inputs_embeds(self): - pass - - def test_for_depth_estimation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - - @unittest.skip(reason="Depth Anything does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="Depth Anything does not support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "LiheYoung/depth-anything-small-hf" - model = DepthAnythingForDepthEstimation.from_pretrained(model_name) - self.assertIsNotNone(model) - - # def test_backbone_selection(self): - # def _validate_backbone_init(): - # for model_class in self.all_model_classes: - # model = model_class(config) - # model.to() - # model.eval() - # - # # Confirm out_indices propogated to backbone - # self.assertEqual(len(model.backbone.out_indices), 2) - # - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # - # # Load a timm backbone - # config.backbone = "resnet18" - # config.use_pretrained_backbone = True - # config.use_timm_backbone = True - # config.backbone_config = None - # # For transformer backbones we can't set the out_indices or just return the features - # config.backbone_kwargs = {"out_indices": (-2, -1)} - # _validate_backbone_init() - # - # # Load a HF backbone - # config.backbone = "facebook/dinov2-small" - # config.use_pretrained_backbone = True - # config.use_timm_backbone = False - # config.backbone_config = None - # config.backbone_kwargs = {"out_indices": [-2, -1]} - # _validate_backbone_init() - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore 
-@require_vision -@slow -class DepthAnythingModelIntegrationTest(unittest.TestCase): - def test_inference(self): - # -- `relative` depth model -- - image_processor = DPTImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") - model = DepthAnythingForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 518, 686) - self.assertEqual(predicted_depth.shape, expected_shape) - expected_slice = mindspore.tensor( - [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], - ) - - self.assertTrue(np.allclose(predicted_depth[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-6)) - - # -- `metric` depth model -- - image_processor = DPTImageProcessor.from_pretrained("depth-anything/depth-anything-V2-metric-indoor-small-hf") - model = DepthAnythingForDepthEstimation.from_pretrained( - "depth-anything/depth-anything-V2-metric-indoor-small-hf" - ) - - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 518, 686) - self.assertEqual(predicted_depth.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]], - ) - - self.assertTrue(np.allclose(predicted_depth[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/deta/__init__.py b/tests/transformers/models/deta/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/deta/test_image_processing_deta.py b/tests/transformers/models/deta/test_image_processing_deta.py deleted file mode 100644 index 45525b14d..000000000 --- a/tests/transformers/models/deta/test_image_processing_deta.py +++ /dev/null @@ -1,703 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import json -import pathlib -import unittest -import numpy as np -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ( - AnnotationFormatTestMixin, - ImageProcessingTestMixin, - prepare_image_inputs, -) - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - -if is_vision_available(): - from PIL import Image - from mindnlp.transformers import DetaImageProcessor - - -class DetaImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_rescale=True, - rescale_factor=1 / 255, - do_pad=True, - ): - # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p - size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_pad = do_pad - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_pad": self.do_pad, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to DetaImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs( - self, equal_resolution=False, numpify=False, torchify=False - ): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class DetaImageProcessingTest( - AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase -): - image_processing_class = DetaImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = DetaImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "do_pad")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict - ) - self.assertEqual( - image_processor.size, {"shortest_edge": 18, "longest_edge": 1333} - ) - self.assertEqual(image_processor.do_pad, True) - - # @slow - def test_call_pytorch_with_coco_detection_annotations(self): - # prepare image and target - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"image_id": 39769, "annotations": target} - - # encode them - image_processing = DetaImageProcessor() - encoding = image_processing( - images=image, annotations=target, return_tensors="ms" - ) - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue( - np.allclose( - encoding["pixel_values"][0, 0, 0, :3].asnumpy(), - expected_slice.asnumpy(), - atol=1e-4, - ) - ) - - # verify area - expected_area = mindspore.tensor( - [5887.9600, 11250.2061, 489353.8438, 
837122.7500, 147967.5156, 165732.3438] - ) - self.assertTrue( - np.allclose( - encoding["labels"][0]["area"].asnumpy(), expected_area.asnumpy() - ) - ) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"][0].asnumpy(), - expected_boxes_slice.asnumpy(), - atol=1e-3, - ) - ) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["image_id"].asnumpy(), expected_image_id.asnumpy() - ) - ) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["iscrowd"].asnumpy(), expected_is_crowd.asnumpy() - ) - ) - # verify class_labels - expected_class_labels = mindspore.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["class_labels"].asnumpy(), - expected_class_labels.asnumpy(), - ) - ) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["orig_size"].asnumpy(), - expected_orig_size.asnumpy(), - ) - ) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["size"].asnumpy(), expected_size.asnumpy() - ) - ) - - # @slow - def test_call_pytorch_with_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - with open( - "./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r" - ) as f: - target = json.loads(f.read()) - - target = { - "file_name": "000000039769.png", - "image_id": 39769, - "segments_info": target, - } - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - # encode them - image_processing = DetaImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=image, annotations=target, masks_path=masks_path, return_tensors="ms" - ) - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue( - np.allclose( - encoding["pixel_values"][0, 0, 0, :3].asnumpy(), - expected_slice.asnumpy(), - atol=1e-4, - ) - ) - - # verify area - expected_area = mindspore.tensor( - [147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147] - ) - self.assertTrue( - np.allclose( - encoding["labels"][0]["area"].asnumpy(), expected_area.asnumpy() - ) - ) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"][0].asnumpy(), - expected_boxes_slice.asnumpy(), - atol=1e-3, - ) - ) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["image_id"].asnumpy(), expected_image_id.asnumpy() - ) - ) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["iscrowd"].asnumpy(), expected_is_crowd.asnumpy() - ) - ) - # verify class_labels - expected_class_labels = mindspore.tensor([17, 17, 63, 75, 75, 93]) - 
self.assertTrue( - np.allclose( - encoding["labels"][0]["class_labels"].asnumpy(), - expected_class_labels.asnumpy(), - ) - ) - # verify masks - expected_masks_sum = 822873 - self.assertEqual( - encoding["labels"][0]["masks"].sum().item(), expected_masks_sum - ) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["orig_size"].asnumpy(), - expected_orig_size.asnumpy(), - ) - ) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue( - np.allclose( - encoding["labels"][0]["size"].asnumpy(), expected_size.asnumpy() - ) - ) - - # @slow - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->Deta - def test_batched_coco_detection_annotations(self): - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open( - "./tests/fixtures/tests_samples/COCO/000000039769.png" - ).resize((800, 800)) - - with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotations_0 = {"image_id": 39769, "annotations": target} - annotations_1 = {"image_id": 39769, "annotations": target} - - # Adjust the bounding boxes for the resized image - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotations_1["annotations"])): - coords = annotations_1["annotations"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotations_1["annotations"][i]["bbox"] = new_bbox - - images = [image_0, image_1] - annotations = [annotations_0, annotations_1] - - image_processing = DetaImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="ms", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].asnumpy(), - expected_boxes_0.asnumpy(), - rtol=1e-3, - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].asnumpy(), - expected_boxes_1.asnumpy(), - rtol=1e-3, - ) - ) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - 
images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].asnumpy(), - expected_boxes_0.asnumpy(), - rtol=1, - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].asnumpy(), - expected_boxes_1.asnumpy(), - rtol=1, - ) - ) - - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->Deta - def test_batched_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open( - "./tests/fixtures/tests_samples/COCO/000000039769.png" - ).resize((800, 800)) - - with open( - "./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r" - ) as f: - target = json.loads(f.read()) - - annotation_0 = { - "file_name": "000000039769.png", - "image_id": 39769, - "segments_info": target, - } - annotation_1 = { - "file_name": "000000039769.png", - "image_id": 39769, - "segments_info": target, - } - - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotation_1["segments_info"])): - coords = annotation_1["segments_info"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotation_1["segments_info"][i]["bbox"] = new_bbox - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - images = [image_0, image_1] - annotations = [annotation_0, annotation_1] - - # encode them - image_processing = DetaImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="ms", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - 
self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].asnumpy(), - expected_boxes_0.asnumpy(), - rtol=1e-3, - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].asnumpy(), - expected_boxes_1.asnumpy(), - rtol=1e-3, - ) - ) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue( - np.allclose( - encoding["labels"][0]["boxes"].asnumpy(), - expected_boxes_0.asnumpy(), - rtol=1, - ) - ) - self.assertTrue( - np.allclose( - encoding["labels"][1]["boxes"].asnumpy(), - expected_boxes_1.asnumpy(), - rtol=1, - ) - ) - - # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta - def test_max_width_max_height_resizing_and_pad_strategy(self): - image_1 = ops.ones([200, 100, 3], dtype=mindspore.uint8) - - # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 - image_processor = DetaImageProcessor( - size={"height": 100, "width": 50}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], 
return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 100, 50)) - - # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 - image_processor = DetaImageProcessor( - size={"height": 200, "width": 100}, - do_pad=False, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - - # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 - image_processor = DetaImageProcessor( - size={"height": 100, "width": 100}, - do_pad=True, - pad_size={"height": 100, "width": 100}, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 100, 100)) - - # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 - image_processor = DetaImageProcessor( - size={"height": 300, "width": 100}, - do_pad=True, - pad_size={"height": 301, "width": 101}, - ) - inputs = image_processor(images=[image_1], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (1, 3, 301, 101)) - - ### Check for batch - image_2 = ops.ones([100, 150, 3], dtype=mindspore.uint8) - - # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 - image_processor = DetaImageProcessor( - size={"height": 150, "width": 100}, - do_pad=True, - pad_size={"height": 150, "width": 100}, - ) - inputs = image_processor(images=[image_1, image_2], return_tensors="ms") - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 150, 100)) diff --git a/tests/transformers/models/deta/test_modeling_deta.py b/tests/transformers/models/deta/test_modeling_deta.py deleted file mode 100644 index f7fbed3a7..000000000 --- a/tests/transformers/models/deta/test_modeling_deta.py +++ /dev/null @@ -1,872 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
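The batched-annotation tests above convert expected boxes from normalized (center_x, center_y, width, height) format to absolute (x_min, y_min, x_max, y_max) corners via `ops.vstack`. A minimal NumPy sketch of that same conversion, assuming `boxes` is an (N, 4) array of normalized center-format boxes and (height, width) is the padded image size; `center_to_corners` is a hypothetical name used only for illustration:

import numpy as np

def center_to_corners(boxes, height, width):
    # scale normalized centers and sizes to absolute pixel coordinates
    cx, cy = boxes[:, 0] * width, boxes[:, 1] * height
    w, h = boxes[:, 2] * width, boxes[:, 3] * height
    # shift from center/size to top-left and bottom-right corners
    return np.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], axis=1)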
-"""Testing suite for the MindSpore DETA model.""" - -import collections -import inspect -import math -import re -import unittest -import numpy as np - -from mindnlp.transformers import ( - DetaConfig, - ResNetConfig, -) -from mindnlp.utils import cached_property -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - is_vision_available, - require_vision, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import DetaForObjectDetection, DetaModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class DetaModelTester: - def __init__( - self, - parent, - batch_size=8, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - num_queries=12, - two_stage_num_proposals=12, - num_channels=3, - image_size=224, - n_targets=8, - num_labels=91, - num_feature_levels=4, - encoder_n_points=2, - decoder_n_points=6, - two_stage=True, - assign_first_stage=True, - assign_second_stage=True, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_queries = num_queries - self.two_stage_num_proposals = two_stage_num_proposals - self.num_channels = num_channels - self.image_size = image_size - self.n_targets = n_targets - self.num_labels = num_labels - self.num_feature_levels = num_feature_levels - self.encoder_n_points = encoder_n_points - self.decoder_n_points = decoder_n_points - self.two_stage = two_stage - self.assign_first_stage = assign_first_stage - self.assign_second_stage = assign_second_stage - - # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = ( - math.ceil(self.image_size / 8) ** 2 - + math.ceil(self.image_size / 16) ** 2 - + math.ceil(self.image_size / 32) ** 2 - + math.ceil(self.image_size / 64) ** 2 - ) - self.decoder_seq_length = self.num_queries - - def prepare_config_and_inputs(self, model_class_name): - pixel_values = floats_tensor( - [self.batch_size, self.num_channels, self.image_size, self.image_size] - ) - - pixel_mask = ops.ones(self.batch_size, self.image_size, self.image_size) - - labels = None - if self.use_labels: - # labels is a list of Dict (each Dict being the labels for a given example in the batch) - labels = [] - for i in range(self.batch_size): - target = {} - target["class_labels"] = ops.randint( - low=0, high=self.num_labels, size=(self.n_targets,) - ) - target["boxes"] = ops.rand(self.n_targets, 4) - target["masks"] = ops.rand( - self.n_targets, - self.image_size, - self.image_size, - ) - labels.append(target) - - config = self.get_config(model_class_name) - return config, pixel_values, pixel_mask, labels - - def get_config(self, model_class_name): - resnet_config = ResNetConfig( - num_channels=3, - embeddings_size=10, - 
hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - ) - two_stage = model_class_name == "DetaForObjectDetection" - assign_first_stage = model_class_name == "DetaForObjectDetection" - assign_second_stage = model_class_name == "DetaForObjectDetection" - return DetaConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - num_queries=self.num_queries, - two_stage_num_proposals=self.two_stage_num_proposals, - num_labels=self.num_labels, - num_feature_levels=self.num_feature_levels, - encoder_n_points=self.encoder_n_points, - decoder_n_points=self.decoder_n_points, - two_stage=two_stage, - assign_first_stage=assign_first_stage, - assign_second_stage=assign_second_stage, - backbone_config=resnet_config, - backbone=None, - ) - - def prepare_config_and_inputs_for_common(self, model_class_name="DetaModel"): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs( - model_class_name - ) - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} - return config, inputs_dict - - def create_and_check_deta_model(self, config, pixel_values, pixel_mask, labels): - model = DetaModel(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.num_queries, self.hidden_size), - ) - - def create_and_check_deta_freeze_backbone( - self, config, pixel_values, pixel_mask, labels - ): - model = DetaModel(config=config) - model.set_train(False) - - model.freeze_backbone() - - for _, param in model.backbone.model.parameters_and_names(): - self.parent.assertEqual(False, param.requires_grad) - - def create_and_check_deta_unfreeze_backbone( - self, config, pixel_values, pixel_mask, labels - ): - model = DetaModel(config=config) - model.set_train(False) - model.unfreeze_backbone() - - for _, param in model.backbone.model.parameters_and_names(): - self.parent.assertEqual(True, param.requires_grad) - - def create_and_check_deta_object_detection_head_model( - self, config, pixel_values, pixel_mask, labels - ): - model = DetaForObjectDetection(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual( - result.logits.shape, - (self.batch_size, self.two_stage_num_proposals, self.num_labels), - ) - self.parent.assertEqual( - result.pred_boxes.shape, (self.batch_size, self.two_stage_num_proposals, 4) - ) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual( - result.logits.shape, - (self.batch_size, self.two_stage_num_proposals, self.num_labels), - ) - self.parent.assertEqual( - result.pred_boxes.shape, (self.batch_size, self.two_stage_num_proposals, 4) - ) - - -class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (DetaModel, DetaForObjectDetection) - pipeline_model_mapping = { - "image-feature-extraction": DetaModel, - 
"object-detection": DetaForObjectDetection, - } - is_encoder_decoder = True - test_torchscript = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, - pipeline_test_casse_name, - config_class, - model_architecture, - tokenizer_name, - processor_name, - ): - if pipeline_test_casse_name == "ObjectDetectionPipelineTests": - return True - - return False - - @unittest.skip( - "Skip for now. PR #22437 causes some loading issue. See (not merged) #22656 for some discussions." - ) - def test_can_use_safetensors(self): - super().test_can_use_safetensors() - - # special case for head models - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class( - inputs_dict, model_class, return_labels=return_labels - ) - - if return_labels: - if model_class.__name__ == "DetaForObjectDetection": - labels = [] - for i in range(self.model_tester.batch_size): - target = {} - target["class_labels"] = ops.ones( - (self.model_tester.n_targets,), - dtype=mindspore.int64, - ) - target["boxes"] = ops.ones( - self.model_tester.n_targets, - 4, - dtype=mindspore.float32, - ) - target["masks"] = ops.ones( - self.model_tester.n_targets, - self.model_tester.image_size, - self.model_tester.image_size, - dtype=mindspore.float32, - ) - labels.append(target) - inputs_dict["labels"] = labels - - return inputs_dict - - def setUp(self): - self.model_tester = DetaModelTester(self) - self.config_tester = ConfigTester( - self, config_class=DetaConfig, has_text_modality=False - ) - - def test_config(self): - # we don't test common_properties and arguments_init as these don't apply for DETA - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - - def test_deta_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class_name="DetaModel" - ) - self.model_tester.create_and_check_deta_model(*config_and_inputs) - - def test_deta_freeze_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class_name="DetaModel" - ) - self.model_tester.create_and_check_deta_freeze_backbone(*config_and_inputs) - - def test_deta_unfreeze_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class_name="DetaModel" - ) - self.model_tester.create_and_check_deta_unfreeze_backbone(*config_and_inputs) - - @unittest.skip - def test_deta_object_detection_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class_name="DetaForObjectDetection" - ) - self.model_tester.create_and_check_deta_object_detection_head_model( - *config_and_inputs - ) - - @unittest.skip(reason="DETA does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="DETA does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="DETA does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="DETA is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="DETA does not use token embeddings") - def 
test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="grid_sampler_2d_grad_cpu_kernel.h:162] store] memcpy_s failed. errorno is: 34") - def test_training(self): - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_feature_levels, - self.model_tester.encoder_n_points, - ], - ) - out_len = len(outputs) - - correct_outlen = 8 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "DetaForObjectDetection": - correct_outlen += 2 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual( - len(decoder_attentions), self.model_tester.num_hidden_layers - ) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_queries, - self.model_tester.num_queries, - ], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_feature_levels, - self.model_tester.decoder_n_points, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - self.model_tester.num_feature_levels, - self.model_tester.encoder_n_points, - ], - ) - - # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad - 
@unittest.skip(reason="MindSpore has no retain_grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - # we take the second output since last_hidden_state is the second item - output = outputs[1] - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] - # encoder_hidden_states.retain_grad() - # encoder_attentions.retain_grad() - - decoder_attentions = outputs.decoder_attentions[0] - # decoder_attentions.retain_grad() - - cross_attentions = outputs.cross_attentions[0] - # cross_attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) - self.assertIsNotNone(cross_attentions.grad) - - def test_forward_auxiliary_loss(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.auxiliary_loss = True - - # only test for object detection and segmentation model - for model_class in self.all_model_classes[1:]: - model = model_class(config) - - inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - - outputs = model(**inputs) - - self.assertIsNotNone(outputs.auxiliary_outputs) - self.assertEqual( - len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1 - ) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = ["pixel_values", "pixel_mask"] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" in arg_names - else [] - ) - self.assertListEqual( - arg_names[: len(expected_arg_names)], expected_arg_names - ) - else: - expected_arg_names = ["pixel_values", "pixel_mask"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - @unittest.skip(reason="Model doesn't use tied weights") - def test_tied_model_weights_key_ignore(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - for name, module in model.cells_and_names(): - if module.__class__.__name__ == "DetaBackboneWithPositionalEncodings": - backbone_params = [ - f"{name}.{key}" for key in module.parameters_dict().keys() - ] - break - - for name, param in model.parameters_and_names(): - if param.requires_grad: - if ( - "level_embed" in name - or "sampling_offsets.bias" in name - or "value_proj" in name - or "output_proj" in name - or "reference_points" in name - or name in backbone_params - ): - continue - self.assertIn( - ((param.mean() * 1e9).round() / 
1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - # Inspired by tests.test_modeling_common.ModelTesterMixin.test_tied_weights_keys - # @unittest.skip(reason="Model doesn't use tied weights") - def test_tied_weights_keys(self): - for model_class in self.all_model_classes: - # We need to pass model class name to correctly initialize the config. - # If we don't pass it, the config for `DetaForObjectDetection`` will be initialized - # with `two_stage=False` and the test will fail because for that case `class_embed` - # weights are not tied. - config, _ = self.model_tester.prepare_config_and_inputs_for_common( - model_class_name=model_class.__name__ - ) - config.tie_word_embeddings = True - - model_tied = model_class(config) - - ptrs = collections.defaultdict(list) - for name, tensor in model_tied.parameters_dict().items(): - ptrs[id_tensor_storage(tensor)].append(name) - - # These are all the pointers of shared tensors. - tied_params = [names for _, names in ptrs.items() if len(names) > 1] - - tied_weight_keys = ( - model_tied._tied_weights_keys - if model_tied._tied_weights_keys is not None - else [] - ) - # Detect we get a hit for each key - for key in tied_weight_keys: - is_tied_key = any( - re.search(key, p) for group in tied_params for p in group - ) - self.assertTrue( - is_tied_key, f"{key} is not a tied weight key for {model_class}." 
- ) - - # Removed tied weights found from tied params -> there should only be one left after - for key in tied_weight_keys: - for i in range(len(tied_params)): - tied_params[i] = [ - p for p in tied_params[i] if re.search(key, p) is None - ] - - tied_params = [group for group in tied_params if len(group) > 1] - self.assertListEqual( - tied_params, - [], - f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", - ) - - -TOLERANCE = 1e-4 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_vision -@slow -# @unittest.skip("Unsupported for batched_nms") -class DetaModelIntegrationTests(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - AutoImageProcessor.from_pretrained("jozhang97/deta-resnet-50", from_pt=True) - if is_vision_available() - else None - ) - - def test_inference_object_detection_head(self): - model = DetaForObjectDetection.from_pretrained( - "jozhang97/deta-resnet-50", from_pt=True - ) - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - outputs = model(**inputs) - - expected_shape_logits = (1, 300, model.config.num_labels) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - - expected_logits = mindspore.Tensor( - [ - [-7.3978, -2.5406, -4.1668], - [-8.2684, -3.9933, -3.8096], - [-7.0515, -3.7973, -5.8516], - ] - ) - expected_boxes = mindspore.Tensor( - [ - [0.5043, 0.4973, 0.9998], - [0.2542, 0.5489, 0.4748], - [0.5490, 0.2765, 0.0570], - ] - ) - - self.assertTrue( - np.allclose( - outputs.logits[0, :3, :3].asnumpy(), - expected_logits.asnumpy(), - atol=1e-4, - ) - ) - - expected_shape_boxes = (1, 300, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue( - np.allclose( - outputs.pred_boxes[0, :3, :3].asnumpy(), - expected_boxes.asnumpy(), - atol=1e-4, - ) - ) - - # verify postprocessing - results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - expected_scores = mindspore.Tensor([0.6392, 0.6276, 0.5546, 0.5260, 0.4706]) - expected_labels = [75, 17, 17, 75, 63] - expected_slice_boxes = mindspore.Tensor([40.5866, 73.2107, 176.1421, 117.1751]) - self.assertTrue( - np.allclose( - results["scores"].asnumpy(), expected_scores.asnumpy(), atol=1e-4 - ) - ) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) - self.assertTrue( - np.allclose( - results["boxes"][0, :].asnumpy(), expected_slice_boxes.asnumpy() - ) - ) - - @slow - def test_inference_object_detection_head_swin_backbone(self): - model = DetaForObjectDetection.from_pretrained( - "jozhang97/deta-swin-large", from_pt=True - ) - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - outputs = model(**inputs) - - expected_shape_logits = (1, 300, model.config.num_labels) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - - expected_logits = mindspore.Tensor( - [ - [-7.6308, -2.8485, -5.3737], - [-7.2037, -4.5505, -4.8027], - [-7.2943, -4.2611, -4.6617], - ] - ) - expected_boxes = mindspore.Tensor( - [ - [0.4987, 0.4969, 0.9999], - [0.2549, 0.5498, 0.4805], - [0.5498, 0.2757, 0.0569], - ] - ) - - self.assertTrue( - np.allclose( - outputs.logits[0, :3, :3].asnumpy(), - expected_logits.asnumpy(), - atol=1e-4, - ) - ) - 
- expected_shape_boxes = (1, 300, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue( - np.allclose( - outputs.pred_boxes[0, :3, :3].asnumpy(), - expected_boxes.asnumpy(), - atol=1e-4, - ) - ) - - expected_shape_boxes = (1, 300, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - self.assertTrue( - np.allclose( - outputs.pred_boxes[0, :3, :3].asnumpy(), - expected_boxes.asnumpy(), - atol=1e-4, - ) - ) - # verify postprocessing - results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - - expected_scores = mindspore.Tensor([0.6831, 0.6826, 0.5684, 0.5464, 0.4392]) - expected_labels = [17, 17, 75, 75, 63] - expected_slice_boxes = mindspore.Tensor([345.8478, 23.6754, 639.8562, 372.8265]) - - self.assertTrue( - np.allclose( - results["scores"].asnumpy(), expected_scores.asnumpy(), atol=1e-4 - ) - ) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) - self.assertTrue( - np.allclose( - results["boxes"][0, :].asnumpy(), expected_slice_boxes.asnumpy() - ) - ) - - -@unittest.skip("No attribute storage") -def storage_ptr(tensor: mindspore.Tensor) -> int: - try: - return tensor.untyped_storage().data_ptr() - except Exception: - # Fallback for torch==1.10 - try: - return tensor.storage().data_ptr() - except NotImplementedError: - # Fallback for meta storage - return 0 - - -_float8_e4m3fn = getattr(mindspore, "float8_e4m3fn", None) -_float8_e5m2 = getattr(mindspore, "float8_e5m2", None) -_SIZE = { - mindspore.int64: 8, - mindspore.float32: 4, - mindspore.int32: 4, - mindspore.bfloat16: 2, - mindspore.float16: 2, - mindspore.int16: 2, - mindspore.uint8: 1, - mindspore.int8: 1, - mindspore.bool_: 1, - mindspore.float64: 8, - _float8_e4m3fn: 1, - _float8_e5m2: 1, -} - - -def storage_size(tensor: mindspore.Tensor) -> int: - try: - return tensor.untyped_storage().nbytes() - except AttributeError: - # Fallback for torch==1.10 - try: - return tensor.storage().shape * _SIZE[tensor.dtype] - except NotImplementedError: - # Fallback for meta storage - # On torch >=2.0 this is the tensor size - return tensor.nelement() * _SIZE[tensor.dtype] - - -def id_tensor_storage(tensor: mindspore.Tensor): - """ - Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For - example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is - guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with - non-overlapping lifetimes may have the same id. - """ - - unique_id = storage_ptr(tensor) - - return tensor.device, unique_id, storage_size(tensor) diff --git a/tests/transformers/models/detr/__init__.py b/tests/transformers/models/detr/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/detr/test_image_processing_detr.py b/tests/transformers/models/detr/test_image_processing_detr.py deleted file mode 100644 index 59009c0ae..000000000 --- a/tests/transformers/models/detr/test_image_processing_detr.py +++ /dev/null @@ -1,555 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore DETR processor. """ -# pylint: disable=not-callable - -import json -import pathlib -import unittest - -import numpy as np -from mindspore import ops - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow, get_tests_dir -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import DetrImageProcessor - - -class DetrImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_pad=True, - ): - super().__init__() - # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p - size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_pad = do_pad - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_pad": self.do_pad, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to DetrImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = DetrImageProcessor if is_vision_available() else None - fixtures_path = pathlib.Path(get_tests_dir()) / 'fixtures/tests_samples/COCO' - - def setUp(self): - self.image_processor_tester = DetrImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_pad")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) - self.assertEqual(image_processor.do_pad, True) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) - - def test_should_raise_if_annotation_format_invalid(self): - image_processor_dict = self.image_processor_tester.prepare_image_processor_dict() - - with open(self.fixtures_path / "coco_annotations.txt", "r") as f: - detection_target = json.loads(f.read()) - - annotations = {"image_id": 39769, "annotations": detection_target} - - params = { - "images": Image.open(self.fixtures_path / "000000039769.png"), - "annotations": annotations, - "return_tensors": "pt", - } - - image_processor_params = {**image_processor_dict, 
**{"format": "_INVALID_FORMAT_"}} - image_processor = self.image_processing_class(**image_processor_params) - - with self.assertRaises(ValueError) as e: - image_processor(**params) - - self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat")) - - def test_valid_coco_detection_annotations(self): - # prepare image and target - image = Image.open(self.fixtures_path / "000000039769.png") - with open(self.fixtures_path / "coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - params = {"image_id": 39769, "annotations": target} - - # encode them - image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") - - # legal encodings (single image) - _ = image_processing(images=image, annotations=params, return_tensors="ms") - _ = image_processing(images=image, annotations=[params], return_tensors="ms") - - # legal encodings (batch of one image) - _ = image_processing(images=[image], annotations=params, return_tensors="ms") - _ = image_processing(images=[image], annotations=[params], return_tensors="ms") - - # legal encoding (batch of more than one image) - n = 5 - _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="ms") - - # example of an illegal encoding (missing the 'image_id' key) - with self.assertRaises(ValueError) as e: - image_processing(images=image, annotations={"annotations": target}, return_tensors="ms") - - self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) - - # example of an illegal encoding (unequal lengths of images and annotations) - with self.assertRaises(ValueError) as e: - image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="ms") - - self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") - - @slow - def test_call_pytorch_with_coco_detection_annotations(self): - # prepare image and target - image = Image.open(self.fixtures_path / "000000039769.png") - with open(self.fixtures_path / "coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"image_id": 39769, "annotations": target} - - # encode them - image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") - encoding = image_processing(images=image, annotations=target, return_tensors="ms") - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(np.allclose(encoding["pixel_values"][0, 0, 0, :3].numpy(), expected_slice.numpy(), atol=1e-4)) - - # verify area - expected_area = mindspore.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) - self.assertTrue(np.allclose(encoding["labels"][0]["area"].numpy(), expected_area.numpy())) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"][0].numpy(), expected_boxes_slice.numpy(), atol=1e-3)) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue(np.allclose(encoding["labels"][0]["image_id"].numpy(), expected_image_id.numpy())) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(np.allclose(encoding["labels"][0]["iscrowd"].numpy(), expected_is_crowd.numpy())) - # verify 
class_labels - expected_class_labels = mindspore.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue(np.allclose(encoding["labels"][0]["class_labels"].numpy(), expected_class_labels.numpy())) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue(np.allclose(encoding["labels"][0]["orig_size"].numpy(), expected_orig_size.numpy())) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue(np.allclose(encoding["labels"][0]["size"].numpy(), expected_size.numpy())) - - @slow - def test_call_pytorch_with_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image = Image.open(self.fixtures_path / "000000039769.png") - with open(self.fixtures_path / "coco_panoptic_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - - masks_path = pathlib.Path(self.fixtures_path / "coco_panoptic") - - # encode them - image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic") - encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="ms") - - # verify pixel values - expected_shape = (1, 3, 800, 1066) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(np.allclose(encoding["pixel_values"][0, 0, 0, :3].numpy(), expected_slice.numpy(), atol=1e-4)) - - # verify area - expected_area = mindspore.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) - self.assertTrue(np.allclose(encoding["labels"][0]["area"].numpy(), expected_area.numpy())) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"][0].numpy(), expected_boxes_slice.numpy(), atol=1e-3)) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue(np.allclose(encoding["labels"][0]["image_id"].numpy(), expected_image_id.numpy())) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(np.allclose(encoding["labels"][0]["iscrowd"].numpy(), expected_is_crowd.numpy())) - # verify class_labels - expected_class_labels = mindspore.tensor([17, 17, 63, 75, 75, 93]) - self.assertTrue(np.allclose(encoding["labels"][0]["class_labels"].numpy(), expected_class_labels.numpy())) - # verify masks - expected_masks_sum = 822873 - self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue(np.allclose(encoding["labels"][0]["orig_size"].numpy(), expected_orig_size.numpy())) - # verify size - expected_size = mindspore.tensor([800, 1066]) - self.assertTrue(np.allclose(encoding["labels"][0]["size"].numpy(), expected_size.numpy())) - - @slow - def test_batched_coco_detection_annotations(self): - image_0 = Image.open(self.fixtures_path / "000000039769.png") - image_1 = Image.open(self.fixtures_path / "000000039769.png").resize((800, 800)) - with open(self.fixtures_path / "coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotations_0 = {"image_id": 39769, "annotations": target} - annotations_1 = {"image_id": 39769, "annotations": target} - - # Adjust the bounding boxes for the resized image - w_0, h_0 = 
image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotations_1["annotations"])): - coords = annotations_1["annotations"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotations_1["annotations"][i]["bbox"] = new_bbox - - images = [image_0, image_1] - annotations = [annotations_0, annotations_1] - - image_processing = DetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="ms", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.4130, 0.2765, 0.0453, 0.2215], - [0.1272, 0.2016, 0.1561, 0.0940], - [0.3757, 0.4933, 0.7488, 0.9865], - [0.3759, 0.5002, 0.7492, 0.9955], - [0.1971, 0.5456, 0.3532, 0.8646], - [0.5790, 0.4115, 0.3430, 0.7161], - ] - ) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1e-3)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + 
unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1)) - - def test_batched_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) - - with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotation_1["segments_info"])): - coords = annotation_1["segments_info"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotation_1["segments_info"][i]["bbox"] = new_bbox - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - images = [image_0, image_1] - annotations = [annotation_0, annotation_1] - - # encode them - image_processing = DetrImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="ms", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1066 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.1576, 0.3262, 0.2814, 0.5175], - [0.4634, 0.2463, 0.2720, 0.4275], - [0.3002, 0.2956, 0.5985, 0.5913], - [0.1013, 0.1200, 0.1238, 0.0550], - [0.3297, 0.1656, 0.0347, 0.1312], - [0.2997, 0.2994, 0.5994, 0.5987], - ] - ) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1e-3)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1066)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1066)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - 
unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].numpy(), expected_boxes_0.numpy(), rtol=1)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].numpy(), expected_boxes_1.numpy(), rtol=1)) diff --git a/tests/transformers/models/detr/test_modeling_detr.py b/tests/transformers/models/detr/test_modeling_detr.py deleted file mode 100644 index 3f691e820..000000000 --- a/tests/transformers/models/detr/test_modeling_detr.py +++ /dev/null @@ -1,671 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore DETR model. 
""" -# pylint: disable=not-callable - -import pathlib -import inspect -import math -import unittest - -import numpy as np -from mindnlp.transformers import DetrConfig, ResNetConfig - -from mindnlp.utils.testing_utils import is_mindspore_available, is_vision_available, require_mindspore, require_vision, slow, get_tests_dir -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor -#from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel - - -if is_vision_available(): - from PIL import Image - from mindnlp.transformers import DetrImageProcessor - - -class DetrModelTester: - def __init__( - self, - parent, - batch_size=8, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - num_queries=12, - num_channels=3, - min_size=200, - max_size=200, - n_targets=8, - num_labels=91, - ): - super().__init__() - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_queries = num_queries - self.num_channels = num_channels - self.min_size = min_size - self.max_size = max_size - self.n_targets = n_targets - self.num_labels = num_labels - - # we also set the expected seq length for both encoder and decoder - self.encoder_seq_length = math.ceil(self.min_size / 32) * math.ceil(self.max_size / 32) - self.decoder_seq_length = self.num_queries - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) - - pixel_mask = ops.ones(self.batch_size, self.min_size, self.max_size) - - labels = None - if self.use_labels: - # labels is a list of Dict (each Dict being the labels for a given example in the batch) - labels = [] - for i in range(self.batch_size): - target = {} - target["class_labels"] = ops.randint( - low=0, high=self.num_labels, size=(self.n_targets,) - ).astype(mindspore.int32) - target["boxes"] = ops.rand(self.n_targets, 4) - target["masks"] = ops.rand(self.n_targets, self.min_size, self.max_size) - labels.append(target) - - config = self.get_config() - return config, pixel_values, pixel_mask, labels - - def get_config(self): - resnet_config = ResNetConfig( - num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - hidden_act="relu", - num_labels=3, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - ) - return DetrConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - 
attention_dropout=self.attention_probs_dropout_prob, - num_queries=self.num_queries, - num_labels=self.num_labels, - use_timm_backbone=False, - backbone_config=resnet_config, - backbone=None, - use_pretrained_backbone=False, - ) - - def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} - return config, inputs_dict - - def create_and_check_detr_model(self, config, pixel_values, pixel_mask, labels): - model = DetrModel(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size) - ) - - def create_and_check_detr_object_detection_head_model(self, config, pixel_values, pixel_mask, labels): - model = DetrForObjectDetection(config=config) - model.set_train(False) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) - self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1)) - self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) - - -@require_mindspore -class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DetrModel, - DetrForObjectDetection, - DetrForSegmentation, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": DetrModel, - "image-segmentation": DetrForSegmentation, - "object-detection": DetrForObjectDetection, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_torchscript = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - zero_init_hidden_state = True - - # special case for head models - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ in ["DetrForObjectDetection", "DetrForSegmentation"]: - labels = [] - for i in range(self.model_tester.batch_size): - target = {} - target["class_labels"] = ops.ones( - self.model_tester.n_targets, dtype=mindspore.int64 - ) - target["boxes"] = ops.ones( - self.model_tester.n_targets, 4, dtype=mindspore.float32 - ) - target["masks"] = ops.ones( - self.model_tester.n_targets, - self.model_tester.min_size, - self.model_tester.max_size, - dtype=mindspore.float32, - ) - labels.append(target) - inputs_dict["labels"] = labels - - return inputs_dict - - def setUp(self): - self.model_tester = DetrModelTester(self) - self.config_tester = ConfigTester(self, config_class=DetrConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_detr_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_detr_model(*config_and_inputs) - - def test_detr_object_detection_head_model(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_detr_object_detection_head_model(*config_and_inputs) - - # TODO: check if this works again for MindSpore 2.x.y - @unittest.skip(reason="Got `CUDA error: misaligned address` with MindSpore 2.0.0.") - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip(reason="DETR does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="DETR does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="DETR does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="DETR is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="DETR does not use token embeddings") - def test_resize_tokens_embeddings(self): - pass - - @slow - def test_model_outputs_equivalence(self): - # TODO Niels: fix me! - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - decoder_seq_length = self.model_tester.decoder_seq_length - encoder_seq_length = self.model_tester.encoder_seq_length - decoder_key_length = self.model_tester.decoder_seq_length - encoder_key_length = self.model_tester.encoder_seq_length - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Object Detection model returns pred_logits and pred_boxes - if model_class.__name__ == "DetrForObjectDetection": - correct_outlen += 2 - # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks - if model_class.__name__ == "DetrForSegmentation": - correct_outlen += 3 - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - 
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - @unittest.skip("MindSpore has no .grad") - def test_retain_grad_hidden_states_attentions(self): - # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_attentions = outputs.encoder_attentions[0] - encoder_hidden_states.retain_grad() - encoder_attentions.retain_grad() - - decoder_attentions = outputs.decoder_attentions[0] - decoder_attentions.retain_grad() - - cross_attentions = outputs.cross_attentions[0] - cross_attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) - self.assertIsNotNone(cross_attentions.grad) - - def test_forward_auxiliary_loss(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.auxiliary_loss = True - - # only test for object detection and segmentation model - for model_class in self.all_model_classes[1:]: - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - outputs = model(**inputs) - - self.assertIsNotNone(outputs.auxiliary_outputs) - self.assertEqual(len(outputs.auxiliary_outputs), self.model_tester.num_hidden_layers - 1) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model.config.is_encoder_decoder: - expected_arg_names = ["pixel_values", "pixel_mask"] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "encoder_outputs"] - if "head_mask" in arg_names and "decoder_head_mask" in arg_names - else [] - ) - self.assertListEqual(arg_names[: 
len(expected_arg_names)], expected_arg_names) - else: - expected_arg_names = ["pixel_values", "pixel_mask"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - @unittest.skip("MindNLP does not depend on timm") - def test_different_timm_backbone(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # let's pick a random timm backbone - config.backbone = "tf_mobilenetv3_small_075" - config.backbone_config = None - config.use_timm_backbone = True - config.backbone_kwargs = {"out_indices": [2, 3, 4]} - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if model_class.__name__ == "DetrForObjectDetection": - expected_shape = ( - self.model_tester.batch_size, - self.model_tester.num_queries, - self.model_tester.num_labels + 1, - ) - self.assertEqual(outputs.logits.shape, expected_shape) - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.model.backbone.conv_encoder.intermediate_channel_sizes), 3) - elif model_class.__name__ == "DetrForSegmentation": - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.detr.model.backbone.conv_encoder.intermediate_channel_sizes), 3) - else: - # Confirm out_indices was propogated to backbone - self.assertEqual(len(model.backbone.conv_encoder.intermediate_channel_sizes), 3) - - self.assertTrue(outputs) - - def test_greyscale_images(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # use greyscale pixel values - inputs_dict["pixel_values"] = floats_tensor( - [self.model_tester.batch_size, 1, self.model_tester.min_size, self.model_tester.max_size] - ) - - # let's set num_channels to 1 - config.num_channels = 1 - config.backbone_config.num_channels = 1 - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertTrue(outputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.init_xavier_std = 1e9 - - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - if "bbox_attention" in name and "bias" not in name: - self.assertLess( - 100000, - abs(param.data.max().item()), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip("MindNLP AutoModel.from_pretrained() not compatible") - def test_save_load_fast_init_from_base(self): - pass - - -TOLERANCE = 1e-4 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open(pathlib.Path(get_tests_dir()) / "fixtures/tests_samples/COCO/000000039769.png") - return image - - -@unittest.skip("MindNLP does not depend on timm") -@require_vision -@slow -class DetrModelIntegrationTestsTimmBackbone(unittest.TestCase): - @cached_property - def default_image_processor(self): - return DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None - - def test_inference_no_head(self): - model = 
DetrModel.from_pretrained("facebook/detr-resnet-50") - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - - outputs = model(**encoding) - - expected_shape = (1, 100, 256) - assert outputs.last_hidden_state.shape == expected_shape - expected_slice = mindspore.tensor( - [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] - ) - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) - - def test_inference_object_detection_head(self): - model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] - - outputs = model(pixel_values, pixel_mask) - - # verify outputs - expected_shape_logits = (1, model.config.num_queries, model.config.num_labels + 1) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - expected_slice_logits = mindspore.tensor( - [[-19.1194, -0.0893, -11.0154], [-17.3640, -1.8035, -14.0219], [-20.0461, -0.5837, -11.1060]] - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].numpy(), expected_slice_logits.numpy(), atol=1e-4)) - - expected_shape_boxes = (1, model.config.num_queries, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - expected_slice_boxes = mindspore.tensor( - [[0.4433, 0.5302, 0.8853], [0.5494, 0.2517, 0.0529], [0.4998, 0.5360, 0.9956]] - ) - self.assertTrue(np.allclose(outputs.pred_boxes[0, :3, :3].numpy(), expected_slice_boxes.numpy(), atol=1e-4)) - - # verify postprocessing - results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - expected_scores = mindspore.tensor([0.9982, 0.9960, 0.9955, 0.9988, 0.9987]) - expected_labels = [75, 75, 63, 17, 17] - expected_slice_boxes = mindspore.tensor([40.1633, 70.8115, 175.5471, 117.9841]) - - self.assertEqual(len(results["scores"]), 5) - self.assertTrue(np.allclose(results["scores"].numpy(), expected_scores.numpy(), atol=1e-4)) - self.assertSequenceEqual(results["labels"].tolist(), expected_labels) - self.assertTrue(np.allclose(results["boxes"][0, :].numpy(), expected_slice_boxes.numpy())) - - def test_inference_panoptic_segmentation_head(self): - model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic") - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - pixel_values = encoding["pixel_values"] - pixel_mask = encoding["pixel_mask"] - - outputs = model(pixel_values, pixel_mask) - - # verify outputs - expected_shape_logits = (1, model.config.num_queries, model.config.num_labels + 1) - self.assertEqual(outputs.logits.shape, expected_shape_logits) - expected_slice_logits = mindspore.tensor( - [[-18.1565, -1.7568, -13.5029], [-16.8888, -1.4138, -14.1028], [-17.5709, -2.5080, -11.8654]] - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].numpy(), expected_slice_logits.numpy(), atol=1e-4)) - - expected_shape_boxes = (1, model.config.num_queries, 4) - self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes) - expected_slice_boxes = mindspore.tensor( - [[0.5344, 0.1789, 0.9285], [0.4420, 0.0572, 0.0875], [0.6630, 0.6887, 0.1017]] - ) - self.assertTrue(np.allclose(outputs.pred_boxes[0, :3, :3].numpy(), 
expected_slice_boxes.numpy(), atol=1e-4)) - - expected_shape_masks = (1, model.config.num_queries, 200, 267) - self.assertEqual(outputs.pred_masks.shape, expected_shape_masks) - expected_slice_masks = mindspore.tensor( - [[-7.7558, -10.8788, -11.9797], [-11.8881, -16.4329, -17.7451], [-14.7316, -19.7383, -20.3004]] - ) - self.assertTrue(np.allclose(outputs.pred_masks[0, 0, :3, :3].numpy(), expected_slice_masks.numpy(), atol=1e-3)) - - # verify postprocessing - results = image_processor.post_process_panoptic_segmentation( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - - expected_shape = (480, 640) - expected_slice_segmentation = mindspore.tensor([[4, 4, 4], [4, 4, 4], [4, 4, 4]], dtype=mindspore.int32) - expected_number_of_segments = 5 - expected_first_segment = {"id": 1, "label_id": 17, "was_fused": False, "score": 0.994097} - - number_of_unique_segments = len(ops.unique(results["segmentation"])) - self.assertTrue( - number_of_unique_segments, expected_number_of_segments + 1 - ) # we add 1 for the background class - self.assertTrue(results["segmentation"].shape, expected_shape) - self.assertTrue(np.allclose(results["segmentation"][:3, :3].numpy(), expected_slice_segmentation.numpy(), atol=1e-4)) - self.assertTrue(len(results["segments_info"]), expected_number_of_segments) - self.assertDictEqual(results["segments_info"][0], expected_first_segment) - - -@require_vision -@require_mindspore -@slow -class DetrModelIntegrationTests(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm") - if is_vision_available() - else None - ) - - @unittest.skip("MindNLP does not depend on timm") - def test_inference_no_head(self): - model = DetrModel.from_pretrained("facebook/detr-resnet-50", revision="no_timm") - - image_processor = self.default_image_processor - image = prepare_img() - encoding = image_processor(images=image, return_tensors="ms") - - outputs = model(**encoding) - - expected_shape = (1, 100, 256) - assert outputs.last_hidden_state.shape == expected_shape - expected_slice = mindspore.tensor( - [[0.0616, -0.5146, -0.4032], [-0.7629, -0.4934, -1.7153], [-0.4768, -0.6403, -0.7826]] - ) - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].numpy(), expected_slice.numpy(), atol=1e-4)) diff --git a/tests/transformers/models/dinov2/__init__.py b/tests/transformers/models/dinov2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/dinov2/test_modeling_dinov2.py b/tests/transformers/models/dinov2/test_modeling_dinov2.py deleted file mode 100644 index 4417ba667..000000000 --- a/tests/transformers/models/dinov2/test_modeling_dinov2.py +++ /dev/null @@ -1,327 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Dinov2 model.""" - -import unittest - -from mindnlp.transformers import Dinov2Config -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, slow -from mindnlp.utils import cached_property, is_mindspore_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - -import numpy as np - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - from mindnlp.transformers import Dinov2ForImageClassification, Dinov2Model, Dinov2Backbone, Dinov2PreTrainedModel - from PIL import Image - from mindnlp.transformers import AutoImageProcessor - - -class Dinov2ModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - - # in Dinov2, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return Dinov2Config( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = Dinov2Model(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = Dinov2Backbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify hidden states - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - expected_size = self.image_size // config.patch_size - self.parent.assertListEqual( 
- list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size] - ) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - - # verify backbone works with out_features=None - config.out_features = None - model = Dinov2Backbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual( - list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size] - ) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - - # verify backbone works with apply_layernorm=False and reshape_hidden_states=False - config.apply_layernorm = False - config.reshape_hidden_states = False - - model = Dinov2Backbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual( - list(result.feature_maps[0].shape), [self.batch_size, self.seq_length, self.hidden_size] - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = Dinov2ForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = Dinov2ForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class Dinov2ModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Dinov2 does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - ( - Dinov2ForImageClassification, - Dinov2Backbone, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": Dinov2Model, "image-classification": Dinov2ForImageClassification} - if is_mindspore_available() - else {} - ) - fx_compatible = True - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = Dinov2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37) - - @is_flaky(max_attempts=3, description="measure of timing is somehow flaky.") - def test_initialization(self): - super().test_initialization() - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Dinov2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # override since we have embeddings / LM heads over multiple codebooks - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - # def test_model_get_set_embeddings(self): - # config, _ = self.model_tester.prepare_config_and_inputs_for_common() - # for model_class in self.all_model_classes: - # model = model_class(config) - # self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - # x = model.get_output_embeddings() - # self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @unittest.skip(reason="Dinov2 does not support feedforward chunking yet") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/dinov2-base" - model = Dinov2Model.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - - -class Dinov2ModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return 
AutoImageProcessor.from_pretrained("facebook/dinov2-base") - - @slow - def test_inference_no_head(self): - model = Dinov2Model.from_pretrained("facebook/dinov2-base") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - # forward pass - - outputs = model(**inputs) - - # verify the last hidden states - expected_shape = (1, 257, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-2.1747, -0.4729, 1.0936], [-3.2780, -0.8269, -0.9210], [-2.9129, 1.1284, -0.7306]] - ) - # [[-2.1757212 -0.43772224 1.0883933 ],[-3.7096467 -1.1127033 -1.3176756 ],[-2.9151366 1.1522964 -0.77162194]] - print(outputs.last_hidden_state[0, :3, :3].asnumpy()) - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - -@require_mindspore -class Dinov2BackboneTest(unittest.TestCase, BackboneTesterMixin): - all_model_classes = (Dinov2Backbone,) if is_mindspore_available() else () - config_class = Dinov2Config - - has_attentions = False - - def setUp(self): - self.model_tester = Dinov2ModelTester(self) diff --git a/tests/transformers/models/donut/__init__.py b/tests/transformers/models/donut/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/donut/test_image_processing_donut.py b/tests/transformers/models/donut/test_image_processing_donut.py deleted file mode 100644 index 565314bf9..000000000 --- a/tests/transformers/models/donut/test_image_processing_donut.py +++ /dev/null @@ -1,231 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import DonutImageProcessor - - -class DonutImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_thumbnail=True, - do_align_axis=False, - do_pad=True, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size if size is not None else {"height": 18, "width": 20} - self.do_thumbnail = do_thumbnail - self.do_align_axis = do_align_axis - self.do_pad = do_pad - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_thumbnail": self.do_thumbnail, - "do_align_long_axis": self.do_align_axis, - "do_pad": self.do_pad, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class DonutImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = DonutImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = DonutImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_thumbnail")) - self.assertTrue(hasattr(image_processing, "do_align_long_axis")) - self.assertTrue(hasattr(image_processing, "do_pad")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 20}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - # Previous config had 
dimensions in (width, height) order - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=(42, 84)) - self.assertEqual(image_processor.size, {"height": 84, "width": 42}) - - @is_flaky() - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ), - ) - - @is_flaky() - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ), - ) - - - #TODO - @is_flaky() - def test_call_mindspore(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random mindspore tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, ms.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ), - ) \ No newline at end of file diff --git a/tests/transformers/models/donut/test_modeling_donut_swin.py b/tests/transformers/models/donut/test_modeling_donut_swin.py deleted file mode 100644 index 8c24074eb..000000000 --- a/tests/transformers/models/donut/test_modeling_donut_swin.py +++ /dev/null @@ -1,349 +0,0 
@@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Donut Swin model.""" - -import collections -import unittest - -from mindnlp.transformers import DonutSwinConfig -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import nn, ops - - from mindnlp.transformers import DonutSwinModel - - -class DonutSwinModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - patch_size=2, - num_channels=3, - embed_dim=16, - depths=[1, 2, 1], - num_heads=[2, 2, 4], - window_size=2, - mlp_ratio=2.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - patch_norm=True, - initializer_range=0.02, - layer_norm_eps=1e-5, - is_training=True, - scope=None, - use_labels=True, - type_sequence_label_size=10, - encoder_stride=8, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.patch_norm = patch_norm - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.is_training = is_training - self.scope = scope - self.use_labels = use_labels - self.type_sequence_label_size = type_sequence_label_size - self.encoder_stride = encoder_stride - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return DonutSwinConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - embed_dim=self.embed_dim, - depths=self.depths, - num_heads=self.num_heads, - window_size=self.window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=self.qkv_bias, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - drop_path_rate=self.drop_path_rate, - hidden_act=self.hidden_act, - use_absolute_embeddings=self.use_absolute_embeddings, - path_norm=self.patch_norm, - 
layer_norm_eps=self.layer_norm_eps, - initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = DonutSwinModel(config=config) - model.set_train(False) - result = model(pixel_values) - - expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) - expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class DonutSwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (DonutSwinModel,) if is_mindspore_available() else () - fx_compatible = True - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = DonutSwinModelTester(self) - # self.config_tester = ConfigTester(self, config_class=DonutSwinConfig, embed_dim=37) - self.config_tester = ConfigTester( - self, - config_class=DonutSwinConfig, - has_text_modality=False, - embed_dim=37, - common_properties=["image_size", "patch_size", "num_channels"], - ) - - def test_config(self): - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="DonutSwin does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - @unittest.skip(reason="DonutSwin does not output any loss term in the forward pass") - def test_training(self): - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - expected_num_attentions = len(self.model_tester.depths) - self.assertEqual(len(attentions), expected_num_attentions) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - window_size_squared = config.window_size**2 - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), expected_num_attentions) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_heads[0], window_size_squared, window_size_squared], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] 
= True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - else: - # also another +1 for reshaped_hidden_states - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), expected_num_attentions) - - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_heads[0], window_size_squared, window_size_squared], - ) - - def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): - model = model_class(config) - - model.set_train(False) - - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # DonutSwin has a different seq_length - patch_size = ( - config.patch_size - if isinstance(config.patch_size, collections.abc.Iterable) - else (config.patch_size, config.patch_size) - ) - - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - reshaped_hidden_states = outputs.reshaped_hidden_states - self.assertEqual(len(reshaped_hidden_states), expected_num_layers) - - batch_size, num_channels, height, width = reshaped_hidden_states[0].shape - reshaped_hidden_states = ( - reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) - ) - self.assertListEqual( - list(reshaped_hidden_states.shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - image_size = ( - self.model_tester.image_size - if isinstance(self.model_tester.image_size, collections.abc.Iterable) - else (self.model_tester.image_size, self.model_tester.image_size) - ) - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - self.check_hidden_states_output(inputs_dict, config, model_class, image_size) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - self.check_hidden_states_output(inputs_dict, config, model_class, image_size) - - def test_hidden_states_output_with_padding(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.patch_size = 3 - - image_size = ( - self.model_tester.image_size - if isinstance(self.model_tester.image_size, collections.abc.Iterable) - else (self.model_tester.image_size, self.model_tester.image_size) - ) - patch_size = ( - config.patch_size - if isinstance(config.patch_size, collections.abc.Iterable) - else (config.patch_size, config.patch_size) - ) - - padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) - padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, 
padded_width)) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) - - @slow - def test_model_from_pretrained(self): - model_name = "naver-clova-ix/donut-base" - model = DonutSwinModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) \ No newline at end of file diff --git a/tests/transformers/models/donut/test_processing_donut.py b/tests/transformers/models/donut/test_processing_donut.py deleted file mode 100644 index 9b510de4a..000000000 --- a/tests/transformers/models/donut/test_processing_donut.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.transformers import DonutProcessor - - -class DonutProcessorTest(unittest.TestCase): - from_pretrained_id = "naver-clova-ix/donut-base" - - def setUp(self): - self.processor = DonutProcessor.from_pretrained(self.from_pretrained_id) - - def test_token2json(self): - expected_json = { - "name": "John Doe", - "age": "99", - "city": "Atlanta", - "state": "GA", - "zip": "30301", - "phone": "123-4567", - "nicknames": [{"nickname": "Johnny"}, {"nickname": "JD"}], - "multiline": "text\nwith\nnewlines", - "empty": "", - } - - sequence = ( - "John Doe99Atlanta" - "GA30301123-4567" - "Johnny" - "JD" - "text\nwith\nnewlines" - "" - ) - actual_json = self.processor.token2json(sequence) - - self.assertDictEqual(actual_json, expected_json) \ No newline at end of file diff --git a/tests/transformers/models/dpr/__init__.py b/tests/transformers/models/dpr/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/dpr/test_modeling_dpr.py b/tests/transformers/models/dpr/test_modeling_dpr.py deleted file mode 100644 index 5c1b70ae6..000000000 --- a/tests/transformers/models/dpr/test_modeling_dpr.py +++ /dev/null @@ -1,298 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Huggingface -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import tempfile -import unittest -import numpy as np - -from mindnlp.transformers import DPRConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRReaderTokenizer - - -class DPRModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=False, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - projection_dim=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.projection_dim = projection_dim - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return DPRConfig( - projection_dim=self.projection_dim, - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - 
hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def create_and_check_context_encoder( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DPRContextEncoder(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) - - def create_and_check_question_encoder( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DPRQuestionEncoder(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size)) - - def create_and_check_reader( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DPRReader(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - ) - - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids} - return config, inputs_dict - - -@require_mindspore -class DPRModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DPRContextEncoder, - DPRQuestionEncoder, - DPRReader, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = {"feature-extraction": DPRQuestionEncoder} if is_mindspore_available() else {} - - test_resize_embeddings = False - test_missing_keys = False # why? 
- test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = DPRModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPRConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_context_encoder_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_context_encoder(*config_and_inputs) - - def test_question_encoder_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_question_encoder(*config_and_inputs) - - def test_reader_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reader(*config_and_inputs) - - def test_init_changed_config(self): - config = self.model_tester.prepare_config_and_inputs()[0] - - model = DPRQuestionEncoder(config=config) - model.set_train(False) - - with tempfile.TemporaryDirectory() as tmp_dirname: - model.save_pretrained(tmp_dirname) - model = DPRQuestionEncoder.from_pretrained(tmp_dirname, projection_dim=512, from_pt=True) - - self.assertIsNotNone(model) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/dpr-ctx_encoder-single-nq-base" - model = DPRContextEncoder.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - model_name = "facebook/dpr-ctx_encoder-single-nq-base" - model = DPRContextEncoder.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - model_name = "facebook/dpr-ctx_encoder-single-nq-base" - model = DPRQuestionEncoder.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - model_name = "facebook/dpr-ctx_encoder-single-nq-base" - model = DPRReader.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - -@require_mindspore -class DPRModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", return_dict=False, from_pt=True) - - input_ids = mindspore.tensor( - [[101, 7592, 1010, 2003, 2026, 3899, 10140, 1029, 102]], dtype=mindspore.int64, - ) # [CLS] hello, is my dog cute? [SEP] - output = model(input_ids)[0] # embedding shape = (1, 768) - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [ - [ - 0.03236253, - 0.12753335, - 0.16818509, - 0.00279786, - 0.3896933, - 0.24264945, - 0.2178971, - -0.02335227, - -0.08481959, - -0.14324117, - ] - ], - dtype=mindspore.float32 - ) - self.assertTrue(np.allclose(output[:, :10].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @slow - def test_reader_inference(self): - tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True) - model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True) - - encoded_inputs = tokenizer( - questions="What is love ?", - titles="Haddaway", - texts="What Is Love is a song recorded by the artist Haddaway", - padding=True, - return_tensors="ms", - ) - - outputs = model(**encoded_inputs) - - # compare the actual values for a slice. 
- expected_start_logits = mindspore.tensor( - [[-10.3005, -10.7765, -11.4872, -11.6841, -11.9312, -10.3002, -9.8544, -11.7378, -12.0821, -10.2975]], - dtype=mindspore.float32, - ) - - expected_end_logits = mindspore.tensor( - [[-11.0684, -11.7041, -11.5397, -10.3465, -10.8791, -6.8443, -11.9959, -11.0364, -10.0096, -6.8405]], - dtype=mindspore.float32 - ) - self.assertTrue(np.allclose(outputs.start_logits[:, :10].asnumpy(), expected_start_logits.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(outputs.end_logits[:, :10].asnumpy(), expected_end_logits.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/dpr/test_tokenization_dpr.py b/tests/transformers/models/dpr/test_tokenization_dpr.py deleted file mode 100644 index fe12cd94e..000000000 --- a/tests/transformers/models/dpr/test_tokenization_dpr.py +++ /dev/null @@ -1,88 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Huggingface -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from mindnlp.transformers import ( - DPRContextEncoderTokenizer, - DPRContextEncoderTokenizerFast, - DPRQuestionEncoderTokenizer, - DPRQuestionEncoderTokenizerFast, - DPRReaderOutput, - DPRReaderTokenizer, - DPRReaderTokenizerFast, -) -from mindnlp.utils.testing_utils import require_tokenizers, slow -from mindnlp.transformers.tokenization_utils_base import BatchEncoding - -from ..bert.test_tokenization_bert import BertTokenizationTest - - -@require_tokenizers -class DPRContextEncoderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRContextEncoderTokenizer - rust_tokenizer_class = DPRContextEncoderTokenizerFast - test_rust_tokenizer = True - from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" - - -@require_tokenizers -class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRQuestionEncoderTokenizer - rust_tokenizer_class = DPRQuestionEncoderTokenizerFast - test_rust_tokenizer = True - from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" - - -@require_tokenizers -class DPRReaderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRReaderTokenizer - rust_tokenizer_class = DPRReaderTokenizerFast - test_rust_tokenizer = True - from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" - - @slow - def test_decode_best_spans(self): - tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased", from_pt=True) - - text_1 = tokenizer.encode("question sequence", add_special_tokens=False) - text_2 = tokenizer.encode("title sequence", add_special_tokens=False) - text_3 = tokenizer.encode("text sequence " * 4, add_special_tokens=False) - input_ids = [[101] + text_1 + [102] + text_2 + [102] + text_3] - reader_input = BatchEncoding({"input_ids": input_ids}) - - start_logits = [[0] * len(input_ids[0])] - end_logits = [[0] * len(input_ids[0])] - relevance_logits = [0] - reader_output = DPRReaderOutput(start_logits, end_logits, relevance_logits) - - start_index, end_index = 8, 9 - start_logits[0][start_index] = 10 - end_logits[0][end_index] = 10 - predicted_spans = 
tokenizer.decode_best_spans(reader_input, reader_output) - self.assertEqual(predicted_spans[0].start_index, start_index) - self.assertEqual(predicted_spans[0].end_index, end_index) - self.assertEqual(predicted_spans[0].doc_id, 0) - - @slow - def test_call(self): - tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased", from_pt=True) - - text_1 = tokenizer.encode("question sequence", add_special_tokens=False) - text_2 = tokenizer.encode("title sequence", add_special_tokens=False) - text_3 = tokenizer.encode("text sequence", add_special_tokens=False) - expected_input_ids = [101] + text_1 + [102] + text_2 + [102] + text_3 - encoded_input = tokenizer(questions=["question sequence"], titles=["title sequence"], texts=["text sequence"]) - self.assertIn("input_ids", encoded_input) - self.assertIn("attention_mask", encoded_input) - self.assertListEqual(encoded_input["input_ids"][0], expected_input_ids) diff --git a/tests/transformers/models/dpt/__init__.py b/tests/transformers/models/dpt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/dpt/test_image_processing_dpt.py b/tests/transformers/models/dpt/test_image_processing_dpt.py deleted file mode 100644 index 209d33b5b..000000000 --- a/tests/transformers/models/dpt/test_image_processing_dpt.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from mindnlp.utils import is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import DPTImageProcessor - - -class DPTImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=None, - image_std=None, - ): - super().__init__() - if image_std is None: - image_std = [0.5, 0.5, 0.5] - if image_mean is None: - image_mean = [0.5, 0.5, 0.5] - size = size if size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class DPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = DPTImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = DPTImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "do_pad")) - self.assertTrue(hasattr(image_processing, "size_divisor")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - def test_padding(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - image = np.random.randn(3, 249, 491) - - # test individual method - image = image_processing.pad_image(image, size_divisor=4) - self.assertTrue(image.shape[1] % 4 
== 0) - self.assertTrue(image.shape[2] % 4 == 0) - - # test by calling - pixel_values = image_processing.preprocess( - image, do_rescale=False, do_resize=False, do_pad=True, size_divisor=4, return_tensors="ms" - ).pixel_values - self.assertTrue(pixel_values.shape[2] % 4 == 0) - self.assertTrue(pixel_values.shape[3] % 4 == 0) - - def test_keep_aspect_ratio(self): - size = {"height": 512, "width": 512} - image_processor = DPTImageProcessor(size=size, keep_aspect_ratio=True, ensure_multiple_of=32) - - image = np.zeros((489, 640, 3)) - - pixel_values = image_processor(image, return_tensors="ms").pixel_values - - self.assertEqual(list(pixel_values.shape), [1, 3, 512, 672]) diff --git a/tests/transformers/models/dpt/test_modeling_dpt.py b/tests/transformers/models/dpt/test_modeling_dpt.py deleted file mode 100644 index 6244a5cf8..000000000 --- a/tests/transformers/models/dpt/test_modeling_dpt.py +++ /dev/null @@ -1,371 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore DPT model.""" - -import numpy as np -import unittest - -from mindnlp.transformers import DPTConfig -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - - from mindnlp.transformers import DPTForDepthEstimation, DPTForSemanticSegmentation, DPTModel - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import DPTImageProcessor - - -class DPTModelTester: - def __init__( - self, - parent, - batch_size=2, - image_size=32, - patch_size=16, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - backbone_out_indices=None, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - num_labels=3, - neck_hidden_sizes=None, - is_hybrid=False, - scope=None, - ): - if neck_hidden_sizes is None: - neck_hidden_sizes = [16, 32] - if backbone_out_indices is None: - backbone_out_indices = [0, 1, 2, 3] - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.backbone_out_indices = backbone_out_indices - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = 
attention_probs_dropout_prob - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - self.is_hybrid = is_hybrid - self.neck_hidden_sizes = neck_hidden_sizes - # sequence length of DPT = num_patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return DPTConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - fusion_hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - backbone_out_indices=self.backbone_out_indices, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - is_hybrid=self.is_hybrid, - neck_hidden_sizes=self.neck_hidden_sizes, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = DPTModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_depth_estimation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = DPTForDepthEstimation(config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = DPTForSemanticSegmentation(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class DPTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - DPTModel, DPTForDepthEstimation, DPTForSemanticSegmentation) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "depth-estimation": DPTForDepthEstimation, - "image-feature-extraction": DPTModel, - "image-segmentation": DPTForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = DPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="DPT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="DPT does not use the nn.Embedding") - def test_model_get_set_embeddings(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), nn.Module) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_depth_estimation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - def test_training(self): - for model_class in self.all_model_classes: - if model_class.__name__ == "DPTForDepthEstimation": - continue - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class.__name__ in MODEL_MAPPING_NAMES.values(): - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, " - "check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, " - "check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.cells_and_names(): - if module.__class__.__name__ == "DPTViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] - break - - for name, param in model.parameters_and_names(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - 
msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_backbone_selection(self): - def _validate_backbone_init(): - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - - if model.__class__.__name__ == "DPTForDepthEstimation": - # Confirm out_indices propogated to backbone - self.assertEqual(len(model.backbone.out_indices), 2) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_pretrained_backbone = True - config.backbone_config = None - config.backbone_kwargs = {"out_indices": [-2, -1]} - # Force load_backbone path - config.is_hybrid = False - - # # Load a timm backbone - # config.backbone = "resnet18" - # config.use_timm_backbone = True - # _validate_backbone_init() - - # Load a HF backbone - config.backbone = "facebook/dinov2-small" - config.use_timm_backbone = False - _validate_backbone_init() - - @slow - def test_model_from_pretrained(self): - model_name = "Intel/dpt-large" - model = DPTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -@slow -class DPTModelIntegrationTest(unittest.TestCase): - def test_inference_depth_estimation(self): - image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large") - model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 384, 384) - self.assertEqual(predicted_depth.shape, expected_shape) - - expected_slice = mindspore.Tensor( - [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] - ) - - self.assertTrue(np.allclose(outputs.predicted_depth[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - # def test_inference_semantic_segmentation(self): - # image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade") - # model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade") - # - # image = prepare_img() - # inputs = image_processor(images=image, return_tensors="ms") - # - # # forward pass - # outputs = model(**inputs) - # - # # verify the logits - # expected_shape = (1, 150, 480, 480) - # self.assertEqual(outputs.logits.shape, expected_shape) - # - # expected_slice = mindspore.Tensor( - # [[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]] - # ) - # - # self.assertTrue(np.allclose(outputs.logits[0, 0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - # - # def test_post_processing_semantic_segmentation(self): - # image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade") - # model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade") - # - # image = prepare_img() - # inputs = image_processor(images=image, return_tensors="ms") - # - # # forward pass - # outputs = model(**inputs) - # - # # outputs.logits = outputs.logits.detach().cpu() - # - # segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)]) - # expected_shape = (500, 300) - # self.assertEqual(segmentation[0].shape, expected_shape) - # - # segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) 
- # expected_shape = (480, 480) - # self.assertEqual(segmentation[0].shape, expected_shape) diff --git a/tests/transformers/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/transformers/models/dpt/test_modeling_dpt_auto_backbone.py deleted file mode 100644 index 56bc03404..000000000 --- a/tests/transformers/models/dpt/test_modeling_dpt_auto_backbone.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore DPT model.""" - -import unittest -import numpy as np - -from mindnlp.transformers import Dinov2Config, DPTConfig -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import DPTForDepthEstimation - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import DPTImageProcessor - - -class DPTModelTester: - def __init__( - self, - parent, - batch_size=2, - num_channels=3, - image_size=32, - patch_size=16, - use_labels=True, - num_labels=3, - is_training=True, - hidden_size=4, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=8, - out_features=None, - apply_layernorm=False, - reshape_hidden_states=False, - neck_hidden_sizes=None, - fusion_hidden_size=6, - ): - if neck_hidden_sizes is None: - neck_hidden_sizes = [2, 2] - if out_features is None: - out_features = ["stage1", "stage2"] - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.out_features = out_features - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states - self.use_labels = use_labels - self.num_labels = num_labels - self.is_training = is_training - self.neck_hidden_sizes = neck_hidden_sizes - self.fusion_hidden_size = fusion_hidden_size - # DPT's sequence length - self.seq_length = (self.image_size // self.patch_size) ** 2 + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return DPTConfig( - backbone_config=self.get_backbone_config(), - backbone=None, - neck_hidden_sizes=self.neck_hidden_sizes, - 
fusion_hidden_size=self.fusion_hidden_size, - ) - - def get_backbone_config(self): - return Dinov2Config( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - is_training=self.is_training, - out_features=self.out_features, - reshape_hidden_states=self.reshape_hidden_states, - ) - - def create_and_check_for_depth_estimation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = DPTForDepthEstimation(config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class DPTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (DPTForDepthEstimation,) if is_mindspore_available() else () - pipeline_model_mapping = {"depth-estimation": DPTForDepthEstimation} if is_mindspore_available() else {} - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = DPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") - def test_inputs_embeds(self): - pass - - def test_for_depth_estimation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - - def test_training(self): - for model_class in self.all_model_classes: - if model_class.__name__ == "DPTForDepthEstimation": - continue - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class.__name__ in MODEL_MAPPING_NAMES.values(): - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - def test_training_gradient_checkpointing(self): - for model_class in self.all_model_classes: - if model_class.__name__ == "DPTForDepthEstimation": - continue - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - - if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: - continue - model = model_class(config) - model.gradient_checkpointing_enable() - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the 
check for the backbone - backbone_params = [] - for name, module in model.cells_and_names(): - if module.__class__.__name__ == "DPTViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] - break - - for name, param in model.parameters_and_names(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="DPT with AutoBackbone does not have a base model and hence no input_embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="DPT with AutoBackbone does not have a base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="DPT with AutoBackbone does not have a base model") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, " - "check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, " - "check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "Intel/dpt-large" - model = DPTForDepthEstimation.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -@slow -class DPTModelIntegrationTest(unittest.TestCase): - @unittest.skip('The precision of dinov2 is incorrect due to ops.interpolate, which leads to the incorrect precision' - 'of this model using the dinov2 backbone. 
For the reason that all other tests have been passed, ' - 'we can believe that the code of model has no problem.') - def test_inference_depth_estimation_dinov2(self): - image_processor = DPTImageProcessor.from_pretrained("facebook/dpt-dinov2-small-kitti") - model = DPTForDepthEstimation.from_pretrained("facebook/dpt-dinov2-small-kitti") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 576, 736) - self.assertEqual(predicted_depth.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[6.0433, 7.1636, 7.4268], [6.9047, 7.2471, 7.2355], [7.9261, 8.0631, 8.0244]] - ) - - self.assertTrue(np.allclose(outputs.predicted_depth[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - def test_inference_depth_estimation_beit(self): - image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-beit-base-384") - model = DPTForDepthEstimation.from_pretrained("Intel/dpt-beit-base-384") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 384, 384) - self.assertEqual(predicted_depth.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[2669.7061, 2663.7144, 2674.9399], [2633.9326, 2650.9092, 2665.4270], [2621.8271, 2632.0129, 2637.2290]] - ) - - self.assertTrue(np.allclose(outputs.predicted_depth[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @unittest.skip('The swinv2 is absent in MindNLP when coding.') - def test_inference_depth_estimation_swinv2(self): - image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-swinv2-tiny-256") - model = DPTForDepthEstimation.from_pretrained("Intel/dpt-swinv2-tiny-256") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 256, 256) - self.assertEqual(predicted_depth.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[1032.7719, 1025.1886, 1030.2661], [1023.7619, 1021.0075, 1024.9121], [1022.5667, 1018.8522, 1021.4145]] - ) - - self.assertTrue(np.allclose(outputs.predicted_depth[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/dpt/test_modeling_dpt_hybrid.py b/tests/transformers/models/dpt/test_modeling_dpt_hybrid.py deleted file mode 100644 index 912f55f1f..000000000 --- a/tests/transformers/models/dpt/test_modeling_dpt_hybrid.py +++ /dev/null @@ -1,417 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore DPT model.""" - -import unittest -import numpy as np - -from mindnlp.transformers import DPTConfig -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, require_vision, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - from mindnlp.core.nn import functional as F - - from mindnlp.transformers import DPTForDepthEstimation, DPTForSemanticSegmentation, DPTModel - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import DPTImageProcessor - - -class DPTModelTester: - def __init__( - self, - parent, - batch_size=2, - image_size=32, - patch_size=16, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=4, - backbone_out_indices=None, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - num_labels=3, - backbone_featmap_shape=None, - neck_hidden_sizes=None, - is_hybrid=True, - scope=None, - ): - if neck_hidden_sizes is None: - neck_hidden_sizes = [16, 16, 32, 32] - if backbone_featmap_shape is None: - backbone_featmap_shape = [1, 32, 24, 24] - if backbone_out_indices is None: - backbone_out_indices = [0, 1, 2, 3] - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.backbone_out_indices = backbone_out_indices - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.num_labels = num_labels - self.backbone_featmap_shape = backbone_featmap_shape - self.scope = scope - self.is_hybrid = is_hybrid - self.neck_hidden_sizes = neck_hidden_sizes - # sequence length of DPT = num_patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - backbone_config = { - "global_padding": "same", - "layer_type": "bottleneck", - "depths": [3, 4, 9], - "out_features": ["stage1", "stage2", "stage3"], - "embedding_dynamic_padding": True, - "hidden_sizes": [16, 16, 32, 32], - "num_groups": 2, - } - - return DPTConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - fusion_hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - backbone_out_indices=self.backbone_out_indices, - num_attention_heads=self.num_attention_heads, - 
intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - is_hybrid=self.is_hybrid, - backbone_config=backbone_config, - backbone=None, - backbone_featmap_shape=self.backbone_featmap_shape, - neck_hidden_sizes=self.neck_hidden_sizes, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = DPTModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_depth_estimation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = DPTForDepthEstimation(config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = DPTForSemanticSegmentation(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class DPTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as DPT does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - DPTModel, DPTForDepthEstimation, DPTForSemanticSegmentation) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "depth-estimation": DPTForDepthEstimation, - "feature-extraction": DPTModel, - "image-segmentation": DPTForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = DPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=DPTConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="DPT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="DPT does not use the nn.Embedding") - def test_model_get_set_embeddings(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_depth_estimation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - def test_training(self): - for model_class in self.all_model_classes: - if model_class.__name__ == "DPTForDepthEstimation": - continue - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class.__name__ in MODEL_MAPPING_NAMES.values(): - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, " - "check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, " - "check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.named_modules(): - if module.__class__.__name__ == "DPTViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] - break - - for name, param in model.named_parameters(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of 
model {model_class} seems not properly initialized", - ) - - @slow - def test_model_from_pretrained(self): - model_name = "Intel/dpt-hybrid-midas" - model = DPTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_raise_readout_type(self): - # We do this test only for DPTForDepthEstimation since it is the only model that uses readout_type - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.readout_type = "add" - with self.assertRaises(ValueError): - _ = DPTForDepthEstimation(config) - - @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") - def test_batching_equivalence(self): - def get_tensor_equivalence_function(batched_input): - # models operating on continuous spaces have higher abs difference than LMs - # instead, we can rely on cos distance for image/speech models, similar to `diffusers` - if "input_ids" not in batched_input: - return lambda tensor1, tensor2: ( - 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, - eps=1e-38) - ) - return lambda tensor1, tensor2: ops.max(ops.abs(tensor1 - tensor2)) - - def recursive_check(batched_object, single_row_object, model_name, key): - if isinstance(batched_object, (list, tuple)): - for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - elif isinstance(batched_object, dict): - for batched_object_value, single_row_object_value in zip( - batched_object.values(), single_row_object.values() - ): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - # do not compare returned loss (0-dim tensor) / codebook ids (int) / caching objects - elif batched_object is None or not isinstance(batched_object, mindspore.Tensor): - return - elif batched_object.dim() == 0: - return - else: - # indexing the first element does not always work - # e.g. models that output similarity scores of size (N, M) would need to index [0, 0] - slice_ids = [slice(0, index) for index in single_row_object.shape] - batched_row = batched_object[slice_ids] - self.assertFalse( - ops.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" - ) - self.assertTrue( - (equivalence(batched_row, single_row_object)) <= 1e-03, - msg=( - f"Batched and Single row outputs are not equal in {model_name} for key={key}. " - f"Difference={equivalence(batched_row, single_row_object)}." 
- ), - ) - - config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() - equivalence = get_tensor_equivalence_function(batched_input) - - for model_class in self.all_model_classes: - config.output_hidden_states = True - - model_name = model_class.__name__ - if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"): - config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - batched_input_prepared = self._prepare_for_class(batched_input, model_class) - model = model_class(config).set_train(False) - - batch_size = self.model_tester.batch_size - single_row_input = {} - for key, value in batched_input_prepared.items(): - if isinstance(value, mindspore.Tensor) and value.shape[0] % batch_size == 0: - # e.g. musicgen has inputs of size (bs*codebooks). in most cases value.shape[0] == batch_size - single_batch_shape = value.shape[0] // batch_size - single_row_input[key] = value[:single_batch_shape] - else: - single_row_input[key] = value - - model_batched_output = model(**batched_input_prepared) - model_row_output = model(**single_row_input) - - if isinstance(model_batched_output, mindspore.Tensor): - model_batched_output = {"model_output": model_batched_output} - model_row_output = {"model_output": model_row_output} - - for key in model_batched_output: - # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan` - if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key: - model_batched_output[key] = model_batched_output[key][1:] - model_row_output[key] = model_row_output[key][1:] - recursive_check(model_batched_output[key], model_row_output[key], model_name, key) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -@slow -class DPTModelIntegrationTest(unittest.TestCase): - def test_inference_depth_estimation(self): - image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas") - model = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - predicted_depth = outputs.predicted_depth - - # verify the predicted depth - expected_shape = (1, 384, 384) - self.assertEqual(predicted_depth.shape, expected_shape) - - expected_slice = mindspore.Tensor( - [[[5.6437, 5.6146, 5.6511], [5.4371, 5.5649, 5.5958], [5.5215, 5.5184, 5.5293]]] - ) - - self.assertTrue( - np.allclose((outputs.predicted_depth[:3, :3, :3] / 100).asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/efficientformer/__init__.py b/tests/transformers/models/efficientformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/efficientformer/test_image_processing_efficientformer.py b/tests/transformers/models/efficientformer/test_image_processing_efficientformer.py deleted file mode 100644 index 16a3f2d64..000000000 --- a/tests/transformers/models/efficientformer/test_image_processing_efficientformer.py +++ /dev/null @@ -1,100 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for EfficientFormerImageProcessor. """ - - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils.import_utils import is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import ViTImageProcessor - - -class EfficientFormerImageProcessorTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=13, - num_channels=3, - image_size=224, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class EfficientFormerImageProcessorTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = EfficientFormerImageProcessorTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_proc_properties(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "image_mean")) - self.assertTrue(hasattr(image_processor, "image_std")) - self.assertTrue(hasattr(image_processor, "do_normalize")) - self.assertTrue(hasattr(image_processor, "do_resize")) - self.assertTrue(hasattr(image_processor, "size")) diff --git a/tests/transformers/models/efficientformer/test_modeling_efficientformer.py b/tests/transformers/models/efficientformer/test_modeling_efficientformer.py deleted file mode 100644 index 4ef2a41c9..000000000 --- a/tests/transformers/models/efficientformer/test_modeling_efficientformer.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch EfficientFormer model. """ - - -import unittest -import warnings -from typing import List -import numpy as np - -from mindnlp.transformers import EfficientFormerConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, - is_mindspore_available, -) -from mindnlp.utils import cached_property - -from mindnlp.utils.import_utils import is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.transformers.models.auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - from mindnlp.transformers import ( - MODEL_MAPPING, - EfficientFormerForImageClassification, - EfficientFormerForImageClassificationWithTeacher, - EfficientFormerModel, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import EfficientFormerImageProcessor - -class EfficientFormerModelTester: - def __init__( - self, - parent, - batch_size: int = 13, - image_size: int = 64, - patch_size: int = 2, - embed_dim: int = 3, - num_channels: int = 3, - is_training: bool = True, - use_labels: bool = True, - hidden_size: int = 128, - hidden_sizes=[16, 32, 64, 128], - num_hidden_layers: int = 7, - num_attention_heads: int = 4, - intermediate_size: int = 37, - hidden_act: str = "gelu", - hidden_dropout_prob: float = 0.1, - attention_probs_dropout_prob: float = 0.1, - type_sequence_label_size: int = 10, - initializer_range: float = 0.02, - encoder_stride: int = 2, - num_attention_outputs: int = 1, - dim: int = 128, - depths: List[int] = [2, 2, 2, 2], - resolution: int = 2, - mlp_expansion_ratio: int = 2, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.encoder_stride = encoder_stride - self.num_attention_outputs = num_attention_outputs - self.embed_dim = embed_dim - self.seq_length = embed_dim + 1 - self.resolution = resolution - self.depths = depths - self.hidden_sizes = hidden_sizes - self.dim = dim - self.mlp_expansion_ratio = mlp_expansion_ratio - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: 
- labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - return config, pixel_values, labels - - def get_config(self): - return EfficientFormerConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, - resolution=self.resolution, - depths=self.depths, - hidden_sizes=self.hidden_sizes, - dim=self.dim, - mlp_expansion_ratio=self.mlp_expansion_ratio, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = EfficientFormerModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = EfficientFormerForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = EfficientFormerForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class EfficientFormerModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as EfficientFormer does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - ( - EfficientFormerModel, - # EfficientFormerForImageClassificationWithTeacher, - EfficientFormerForImageClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": EfficientFormerModel, - "image-classification": ( - EfficientFormerForImageClassification, - EfficientFormerForImageClassificationWithTeacher, - ), - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = EfficientFormerModelTester(self) - self.config_tester = ConfigTester( - self, config_class=EfficientFormerConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="EfficientFormer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="EfficientFormer does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[-1].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[-1].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - return inputs_dict - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="EfficientFormer does not implement masked image modeling yet") - def test_for_masked_image_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - # special case for EfficientFormerForImageClassificationWithTeacher model - def test_training(self): - if not self.model_tester.is_training: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - # EfficientFormerForImageClassificationWithTeacher supports inference-only - if ( - model_class in get_values(MODEL_MAPPING) - or model_class.__name__ == "EfficientFormerForImageClassificationWithTeacher" - ): - continue - model = model_class(config) - model.set_train(False) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - logits = model(**inputs).logits - - - @slow - def test_model_from_pretrained(self): - for model_name in EFFICIENTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = EfficientFormerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_attention_outputs) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_attention_outputs) - - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class EfficientFormerModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - EfficientFormerImageProcessor.from_pretrained("snap-research/efficientformer-l1-300") - if is_vision_available() - else None - ) - - @slow - def test_inference_image_classification_head(self): 
- model = EfficientFormerForImageClassification.from_pretrained("snap-research/efficientformer-l1-300") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.Tensor([-0.0555, 0.4825, -0.0852]) - self.assertTrue(np.allclose(outputs.logits[0][:3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - @slow - def test_inference_image_classification_head_with_teacher(self): - model = EfficientFormerForImageClassificationWithTeacher.from_pretrained( - "snap-research/efficientformer-l1-300" - ) - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.Tensor([-0.1312, 0.4353, -1.0499]) - self.assertTrue(np.allclose(outputs.logits[0][:3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/electra/__init__.py b/tests/transformers/models/electra/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/electra/test_modeling_electra.py b/tests/transformers/models/electra/test_modeling_electra.py deleted file mode 100644 index 93a4713f2..000000000 --- a/tests/transformers/models/electra/test_modeling_electra.py +++ /dev/null @@ -1,480 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -from mindnlp.transformers import ElectraConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - ElectraForCausalLM, - ElectraForMaskedLM, - ElectraForMultipleChoice, - ElectraForPreTraining, - ElectraForQuestionAnswering, - ElectraForSequenceClassification, - ElectraForTokenClassification, - ElectraModel, - ) - - -class ElectraModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ) - - def get_config(self): - return ElectraConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - _, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_electra_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = ElectraModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_electra_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = ElectraModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_electra_for_masked_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = ElectraForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_electra_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = ElectraForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_electra_for_token_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_labels = self.num_labels - model = ElectraForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_electra_for_pretraining( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_labels = self.num_labels - model = ElectraForPreTraining(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_electra_for_sequence_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_labels = self.num_labels - model = ElectraForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_electra_for_question_answering( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = ElectraForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_electra_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_choices = self.num_choices - model = ElectraForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class ElectraModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - ElectraModel, - ElectraForPreTraining, - ElectraForMaskedLM, - ElectraForCausalLM, - ElectraForMultipleChoice, - ElectraForTokenClassification, - ElectraForSequenceClassification, - ElectraForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": ElectraModel, - "fill-mask": ElectraForMaskedLM, - "question-answering": ElectraForQuestionAnswering, - "text-classification": ElectraForSequenceClassification, - "text-generation": 
ElectraForCausalLM, - "token-classification": ElectraForTokenClassification, - "zero-shot": ElectraForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = ElectraModelTester(self) - self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_electra_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_model(*config_and_inputs) - - def test_electra_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_electra_model_as_decoder(*config_and_inputs) - - def test_electra_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_electra_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs) - - def test_for_pre_training(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_sequence_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_question_answering(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/electra-small-generator" - model = ElectraModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_electra_for_causal_lm(*config_and_inputs) - - -@require_mindspore -class ElectraModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = ElectraModel.from_pretrained("google/electra-small-discriminator") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - output = model(input_ids, attention_mask=attention_mask)[0] - 
expected_shape = (1, 11, 256) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[0.4471, 0.6821, -0.3265], [0.4627, 0.5255, -0.3668], [0.4532, 0.3313, -0.4344]]] - ) - - self.assertTrue(ops.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/encodec/__init__.py b/tests/transformers/models/encodec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/encodec/test_modeling_encodec.py b/tests/transformers/models/encodec/test_modeling_encodec.py deleted file mode 100644 index a18610474..000000000 --- a/tests/transformers/models/encodec/test_modeling_encodec.py +++ /dev/null @@ -1,539 +0,0 @@ -""" Testing suite for the MindSpore Encodec model. """ - -import copy -import inspect -import unittest -from typing import Dict, List, Tuple -import numpy as np - - -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - slow, -) -from mindnlp.transformers import EncodecModel, EncodecConfig -from mindnlp.transformers.models.encodec import EncodecFeatureExtractor -from datasets import Audio, load_dataset - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, -) - - -# from ...test_pipeline_mixin import PipelineTesterMixin - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - -def prepare_inputs_dict( - config, - input_ids=None, - input_values=None, - decoder_input_ids=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - r""" - prepare inputs - """ - if input_ids is not None: - encoder_dict = {"input_ids": input_ids} - else: - encoder_dict = {"input_values": input_values} - - decoder_dict = {"decoder_input_ids": decoder_input_ids} if decoder_input_ids is not None else {} - - return {**encoder_dict, **decoder_dict} - - -@require_mindspore -class EncodecModelTester: - r""" - EncodecModel Tester - """ - def __init__( - self, - parent, - # `batch_size` needs to be an even number if the model has some outputs with batch dim != 0. 
- batch_size=12, - num_channels=2, - is_training=False, - intermediate_size=40, - hidden_size=32, - num_filters=8, - num_residual_layers=1, - upsampling_ratios=[8, 4], - num_lstm_layers=1, - codebook_size=64, - ): - r""" - Test EncodecModel - """ - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.is_training = is_training - self.intermediate_size = intermediate_size - self.hidden_size = hidden_size - self.num_filters = num_filters - self.num_residual_layers = num_residual_layers - self.upsampling_ratios = upsampling_ratios - self.num_lstm_layers = num_lstm_layers - self.codebook_size = codebook_size - - def prepare_config_and_inputs(self): - r""" - prepare config and inputs - """ - input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0) - config = self.get_config() - inputs_dict = {"input_values": input_values} - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - r""" - prepare config and inputs for common - """ - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_config(self): - r""" - obtain EncodecModel Config - """ - return EncodecConfig( - audio_channels=self.num_channels, - chunk_in_sec=None, - hidden_size=self.hidden_size, - num_filters=self.num_filters, - num_residual_layers=self.num_residual_layers, - upsampling_ratios=self.upsampling_ratios, - num_lstm_layers=self.num_lstm_layers, - codebook_size=self.codebook_size, - ) - - def create_and_check_model_forward(self, config, inputs_dict): - r""" - Encodec Model check method - """ - model = EncodecModel(config=config).set_train(False) - - input_values = inputs_dict["input_values"] - result = model(input_values) - self.parent.assertEqual( - result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size) - ) - - -@require_mindspore -class EncodecModelTest(ModelTesterMixin, unittest.TestCase): - r""" - EncodecModel Test - """ - all_model_classes = (EncodecModel,) if is_mindspore_available() else () - is_encoder_decoder = True - test_pruning = False - test_headmasking = False - test_resize_embeddings = False - pipeline_model_mapping = {"feature-extraction": EncodecModel} if is_mindspore_available() else {} - input_name = "input_values" - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - # model does not have attention and does not support returning hidden states - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - if "output_attentions" in inputs_dict: - inputs_dict.pop("output_attentions") - if "output_hidden_states" in inputs_dict: - inputs_dict.pop("output_hidden_states") - return inputs_dict - - def setUp(self): - self.model_tester = EncodecModelTester(self) - self.config_tester = ConfigTester( - self, config_class=EncodecConfig, hidden_size=37, common_properties=[], has_text_modality=False - ) - - def test_config(self): - r""" - test config - """ - self.config_tester.run_common_tests() - - def test_model_forward(self): - r""" - test_model_forward - """ - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so 
arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_values", "padding_mask", "bandwidth"] - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - @unittest.skip("The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") - def test_inputs_embeds(self): - pass - - @unittest.skip("The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") - def test_retain_grad_hidden_states_attentions(self): - r""" - skip - """ - - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") - def test_attention_outputs(self): - pass - - def test_feed_forward_chunking(self): - (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - config.chunk_length_s = None - config.overlap = None - config.sampling_rate = 10 - - model = model_class(config) - model.set_train(False) - inputs = self._prepare_for_class(inputs_dict, model_class) - inputs["input_values"] = ops.tile(inputs["input_values"], (1, 1, 10)) - - hidden_states_no_chunk = model(**inputs)[0] - - config.chunk_length_s = 1 - config.overlap = 0 - config.sampling_rate = 10 - - model = model_class(config) - model.set_train(False) - - hidden_states_with_chunk = model(**inputs)[0] - # print(hidden_states_no_chunk) - self.assertTrue(np.allclose(hidden_states_no_chunk.asnumpy().astype(float), hidden_states_with_chunk.asnumpy().astype(float), atol=1e-3)) - - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic") - def test_hidden_states_output(self): - pass - - def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_determinism(first, second): - # outputs are not tensors but list (since each sequence don't have the same frame_length) - out_1 = first.asnumpy() - out_2 = second.asnumpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - first = model(**self._prepare_for_class(inputs_dict, model_class))[0] - second = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - if isinstance(first, tuple) and isinstance(second, tuple): - for tensor1, tensor2 in zip(first, second): - check_determinism(tensor1, tensor2) - else: - check_determinism(first, second) - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - #pylint: disable=W0102 - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - r""" - check_equivalence - """ - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, 
dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - np.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." - ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = ["conv"] - ignore_init = ["lstm"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif not any(x in name for x in ignore_init): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_identity_shortcut(self): - r""" - test_identity_shortcut - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - config.use_conv_shortcut = False - self.model_tester.create_and_check_model_forward(config, inputs_dict) - - -def normalize(arr): - r""" - norm method - """ - norm = np.linalg.norm(arr) - normalized_arr = arr / norm - return normalized_arr - - -def compute_rmse(arr1, arr2): - r""" - compute rmse - """ - arr1_normalized = normalize(arr1) - arr2_normalized = normalize(arr2) - return np.sqrt(((arr1_normalized - arr2_normalized) ** 2).mean()) - - -@slow -@require_mindspore -class EncodecIntegrationTest(unittest.TestCase): - r""" - Test Encodec Integration - """ - def test_integration_24kHz(self): - r""" - 24KHz - """ - expected_rmse = { - "1.5": 0.0025, - "24.0": 0.0015, - } - expected_codesums = { - "1.5": [371955], - "24.0": [6659962], - } - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - model_id = "facebook/encodec_24khz" - - model = EncodecModel.from_pretrained(model_id) - processor = EncodecFeatureExtractor.from_pretrained(model_id) - - librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) - audio_sample = librispeech_dummy[-1]["audio"]["array"] - - inputs = processor( - raw_audio=audio_sample, - sampling_rate=processor.sampling_rate, - return_tensors="ms", - ) - - for bandwidth, expected_rmse in expected_rmse.items(): - - # use max bandwith for best possible reconstruction - encoder_outputs = model.encode(inputs["input_values"], bandwidth=float(bandwidth)) - - audio_code_sums = [a[0].sum().item() for a in 
encoder_outputs[0]] - - # make sure audio encoded codes are correct - self.assertListEqual(audio_code_sums, expected_codesums[bandwidth]) - - audio_codes, scales = encoder_outputs.to_tuple() - input_values_dec = model.decode(audio_codes, scales, inputs["padding_mask"])[0] - input_values_enc_dec = model( - inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth) - )[-1] - - # make sure forward and decode gives same result - self.assertTrue(np.allclose(input_values_dec.asnumpy(), input_values_enc_dec.asnumpy(), atol=1e-3)) - - # make sure shape matches - self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape) - - arr = inputs["input_values"][0].asnumpy() - arr_enc_dec = input_values_enc_dec[0].asnumpy() - - # make sure audios are more or less equal - # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0 - rmse = compute_rmse(arr, arr_enc_dec) - print(rmse, expected_rmse) - self.assertTrue(rmse < expected_rmse) - - def test_integration_48kHz(self): - r""" - 48KHz test - """ - expected_rmse = { - "3.0": 0.001, - "24.0": 0.0005, - } - expected_codesums = { - "3.0": [144259, 146765, 156435, 176871, 161971], - "24.0": [1568553, 1294948, 1306190, 1464747, 1663150], - } - librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - model_id = "facebook/encodec_48khz" - - model = EncodecModel.from_pretrained(model_id) - model.set_train(False) - processor = EncodecFeatureExtractor.from_pretrained(model_id) - - librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) - audio_sample = librispeech_dummy[-1]["audio"]["array"] - - # transform mono to stereo - audio_sample = np.array([audio_sample, audio_sample]) - - inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="ms") - - for bandwidth, expected_rmse in expected_rmse.items(): - - # use max bandwith for best possible reconstruction - encoder_outputs = model.encode( - inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth), return_dict=False - ) - audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]] - - # make sure audio encoded codes are correct - self.assertListEqual(audio_code_sums, expected_codesums[bandwidth]) - audio_codes, scales = encoder_outputs - input_values_dec = model.decode(audio_codes, scales, inputs["padding_mask"])[0] - input_values_enc_dec = model( - inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth) - )[-1] - - # make sure forward and decode gives same result - self.assertTrue(np.allclose(input_values_dec.asnumpy(), input_values_enc_dec.asnumpy(), atol=1e-3)) - - # make sure shape matches - self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape) - - arr = inputs["input_values"][0].asnumpy() - arr_enc_dec = input_values_enc_dec[0].asnumpy() - - # make sure audios are more or less equal - # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0 - rmse = compute_rmse(arr, arr_enc_dec) - self.assertTrue(rmse < expected_rmse) - - def test_batch_48kHz(self): - r""" - batch_48khz - """ - expected_rmse = { - "3.0": 0.001, - "24.0": 0.0005, - } - expected_codesums = { - "3.0": [ - [72410, 79137, 76694, 90854, 73023, 82980, 72707, 54842], - [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], - ], - "24.0": [ - [72410, 79137, 76694, 90854, 73023, 82980, 72707, 54842], - [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], - ], - } - 
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - model_id = "facebook/encodec_48khz" - - model = EncodecModel.from_pretrained(model_id) - processor = EncodecFeatureExtractor.from_pretrained(model_id, chunk_length_s=1, overlap=0.01) - - librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) - - audio_samples = [ - np.array([audio_sample["array"], audio_sample["array"]]) - for audio_sample in librispeech_dummy[-2:]["audio"] - ] - - inputs = processor(raw_audio=audio_samples, sampling_rate=processor.sampling_rate, return_tensors="ms") - input_values = inputs["input_values"] - for bandwidth, expected_rmse in expected_rmse.items(): - # use max bandwith for best possible reconstruction - encoder_outputs = model.encode(input_values, bandwidth=float(bandwidth), return_dict=False) - audio_code_sums_0 = [a[0][0].sum().item() for a in encoder_outputs[0]] - audio_code_sums_1 = [a[0][1].sum().item() for a in encoder_outputs[0]] - - # make sure audio encoded codes are correct - self.assertListEqual(audio_code_sums_0, expected_codesums[bandwidth][0]) - self.assertListEqual(audio_code_sums_1, expected_codesums[bandwidth][1]) - - audio_codes, scales = encoder_outputs - input_values_dec = model.decode(audio_codes, scales)[0] - input_values_enc_dec = model(input_values, bandwidth=float(bandwidth))[-1] - - # make sure forward and decode gives same result - self.assertTrue(np.allclose(input_values_dec, input_values_enc_dec, atol=1e-3)) - - # make sure shape matches - self.assertTrue(input_values.shape == input_values_enc_dec.shape) - - arr = input_values[0].asnumpy() - arr_enc_dec = input_values_enc_dec[0].asnumpy() - - # make sure audios are more or less equal - # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0 - rmse = compute_rmse(arr, arr_enc_dec) - self.assertTrue(rmse < expected_rmse) diff --git a/tests/transformers/models/ernie/__init__.py b/tests/transformers/models/ernie/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/ernie/test_modeling_ernie.py b/tests/transformers/models/ernie/test_modeling_ernie.py deleted file mode 100644 index 0e2a16b79..000000000 --- a/tests/transformers/models/ernie/test_modeling_ernie.py +++ /dev/null @@ -1,572 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import tempfile -import unittest -import numpy as np - -from mindnlp.transformers import ErnieConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import slow, require_mindspore, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - ErnieForCausalLM, - ErnieForMaskedLM, - ErnieForMultipleChoice, - ErnieForNextSentencePrediction, - ErnieForPreTraining, - ErnieForQuestionAnswering, - ErnieForSequenceClassification, - ErnieForTokenClassification, - ErnieModel, - ) - from mindnlp.transformers.models.ernie.modeling_ernie import ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST - - -class ErnieModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - """ - Returns a tiny configuration by default. 
- """ - return ErnieConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = ErnieModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = ErnieForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieForMaskedLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def 
create_and_check_model_for_causal_lm_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = ErnieForCausalLM(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = ErnieForCausalLM(config=config).set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_for_next_sequence_prediction( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieForNextSentencePrediction(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) - - def create_and_check_for_pretraining( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieForPreTraining(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - 
next_sentence_label=sequence_labels, - ) - self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ErnieForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ErnieForTokenClassification(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = ErnieForMultipleChoice(config=config) - - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class ErnieModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - ErnieForCausalLM, - ErnieForMaskedLM, - ErnieForMultipleChoice, - ErnieForNextSentencePrediction, - ErnieForPreTraining, - ErnieForQuestionAnswering, - ErnieForSequenceClassification, - ErnieForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (ErnieForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": ErnieModel, - "fill-mask": ErnieForMaskedLM, - 
"question-answering": ErnieForQuestionAnswering, - "text-classification": ErnieForSequenceClassification, - "text-generation": ErnieForCausalLM, - "token-classification": ErnieForTokenClassification, - "zero-shot": ErnieForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64) - inputs_dict["next_sentence_label"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64) - return inputs_dict - - def setUp(self): - self.model_tester = ErnieModelTester(self) - self.config_tester = ConfigTester(self, config_class=ErnieConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_causal_lm_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_for_causal_lm_as_decoder(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_next_sequence_prediction(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_next_sequence_prediction(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = ErnieModel.from_pretrained(model_name) - self.assertIsNotNone(model) diff --git a/tests/transformers/models/ernie_m/__init__.py b/tests/transformers/models/ernie_m/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/ernie_m/test_modeling_ernie_m.py b/tests/transformers/models/ernie_m/test_modeling_ernie_m.py deleted file mode 100644 index 0e737f8af..000000000 --- a/tests/transformers/models/ernie_m/test_modeling_ernie_m.py +++ /dev/null @@ -1,322 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. and Baidu team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ErnieM model. 
""" - - -import unittest -import numpy as np - -from mindnlp.transformers import ErnieMConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - ErnieMForInformationExtraction, - ErnieMForMultipleChoice, - ErnieMForQuestionAnswering, - ErnieMForSequenceClassification, - ErnieMForTokenClassification, - ErnieMModel, - ) - from mindnlp.transformers.models.ernie_m.modeling_ernie_m import ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST - -class ErnieMModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - - def prepare_config_and_inputs_for_uiem(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return ErnieMConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - 
type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = ErnieMModel(config=config) - - model.set_train(False) - result = model(input_ids, return_dict=True) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_question_answering( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieMForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_information_extraction( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = ErnieMForInformationExtraction(config=config) - - model.set_train(False) - sequence_labels = ops.ones_like(input_ids, dtype=mindspore.float32) - result = model( - input_ids, - attention_mask=input_mask, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ErnieMForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = ErnieMForTokenClassification(config=config) - - model.set_train(False) - - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = ErnieMForMultipleChoice(config=config) - - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class ErnieMModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - ErnieMModel, - ErnieMForMultipleChoice, - 
ErnieMForQuestionAnswering, - ErnieMForSequenceClassification, - ErnieMForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = () - pipeline_model_mapping = ( - { - "feature-extraction": ErnieMModel, - "question-answering": ErnieMForQuestionAnswering, - "text-classification": ErnieMForSequenceClassification, - "token-classification": ErnieMForTokenClassification, - "zero-shot": ErnieMForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_torchscript = False - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests": - return True - - return False - - def setUp(self): - self.model_tester = ErnieMModelTester(self) - self.config_tester = ConfigTester(self, config_class=ErnieMConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_information_extraction(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_information_extraction(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = ErnieMModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class ErnieMModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_model(self): - model = ErnieMModel.from_pretrained("susnato/ernie-m-base_pytorch") - model.set_train(False) - input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - # TODO Replace vocab size - hidden_size = 768 - - expected_shape = (1, 6, hidden_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[[0.11525417, 0.32232907, -0.14834584], [0.09505975, 0.11805594, 0.06016424], [0.03162006, 0.02301355, 0.06967615]]] - ) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/esm/__init__.py b/tests/transformers/models/esm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git 
a/tests/transformers/models/esm/test_modeling_esm.py b/tests/transformers/models/esm/test_modeling_esm.py deleted file mode 100644 index 26c1caed0..000000000 --- a/tests/transformers/models/esm/test_modeling_esm.py +++ /dev/null @@ -1,335 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ESM model. """ - - -import unittest -import numpy as np - -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers import EsmConfig -from mindnlp.utils.testing_utils import TestCasePlus, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import EsmForMaskedLM, EsmForSequenceClassification, EsmForTokenClassification, EsmModel - from mindnlp.transformers.models.esm.modeling_esm import ( - ESM_PRETRAINED_MODEL_ARCHIVE_LIST, - EsmEmbeddings, - create_position_ids_from_input_ids, - ) - - -# copied from tests.test_modeling_roberta -class EsmModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=False, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=33, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], 
self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return EsmConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - pad_token_id=1, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = EsmModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = EsmForMaskedLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = EsmForTokenClassification(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_forward_and_backwards( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - gradient_checkpointing=False, - ): - model = EsmForMaskedLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class EsmModelTest(ModelTesterMixin, unittest.TestCase): - test_mismatched_shapes = False - - all_model_classes = ( - ( - EsmForMaskedLM, - EsmModel, - EsmForSequenceClassification, - EsmForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = () - pipeline_model_mapping = ( - { - "feature-extraction": EsmModel, - "fill-mask": EsmForMaskedLM, - "text-classification": EsmForSequenceClassification, - "token-classification": EsmForTokenClassification, - "zero-shot": EsmForSequenceClassification, - } - if 
is_mindspore_available() - else {} - ) - test_sequence_classification_problem_types = True - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = EsmModelTester(self) - self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in ESM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = EsmModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is EsmEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = EsmEmbeddings(config=config) - - input_ids = mindspore.tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = mindspore.tensor( - [ - [ - 0 + model.padding_idx + 1, - 1 + model.padding_idx + 1, - 2 + model.padding_idx + 1, - model.padding_idx, - ] - ] - ) - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is EsmEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = EsmEmbeddings(config=config) - - inputs_embeds = ops.randn(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = mindspore.tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - @unittest.skip("Esm does not support embedding resizing") - def test_resize_embeddings_untied(self): - pass - - @unittest.skip("Esm does not support embedding resizing") - def test_resize_tokens_embeddings(self): - pass - - -@slow -@require_mindspore -class EsmModelIntegrationTest(TestCasePlus): - def test_inference_masked_lm(self): - model = EsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D") - model.set_train(False) - input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - vocab_size = 33 - - expected_shape = (1, 6, vocab_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[[8.9215, -10.5898, -6.4671], [-6.3967, -13.9114, -1.1212], [-7.7812, -13.9516, -3.7406]]] - ) - print(output[:, :3, :3]) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - def test_inference_no_head(self): - model = EsmModel.from_pretrained("facebook/esm2_t6_8M_UR50D") - model.set_train(False) - - input_ids = mindspore.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [[[0.1444, 0.5413, 0.3248], [0.3034, 0.0053, 0.3108], [0.3228, -0.2499, 0.3415]]] - ) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/esm/test_modeling_esmfold.py b/tests/transformers/models/esm/test_modeling_esmfold.py deleted file mode 100644 index 377856747..000000000 --- a/tests/transformers/models/esm/test_modeling_esmfold.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch ESM model. 
""" - - -import unittest -import numpy as np - -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers import EsmConfig -from mindnlp.utils.testing_utils import TestCasePlus, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers.models.esm.modeling_esmfold import EsmForProteinFolding - -class EsmFoldModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=False, - use_input_mask=True, - use_token_type_ids=False, - use_labels=False, - vocab_size=19, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - esmfold_config = { - "trunk": { - "num_blocks": 2, - "sequence_state_dim": 64, - "pairwise_state_dim": 16, - "sequence_head_width": 4, - "pairwise_head_width": 4, - "position_bins": 4, - "chunk_size": 16, - "structure_module": { - "ipa_dim": 16, - "num_angles": 7, - "num_blocks": 2, - "num_heads_ipa": 4, - "pairwise_dim": 16, - "resnet_dim": 16, - "sequence_dim": 48, - }, - }, - "fp16_esm": False, - "lddt_head_hid_dim": 16, - } - config = EsmConfig( - vocab_size=33, - hidden_size=self.hidden_size, - pad_token_id=1, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - 
max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - is_folding_model=True, - esmfold_config=esmfold_config, - ) - return config - - def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = EsmForProteinFolding(config=config).float() - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - result = model(input_ids) - - self.parent.assertEqual(result.positions.shape, (2, self.batch_size, self.seq_length, 14, 3)) - self.parent.assertEqual(result.angles.shape, (2, self.batch_size, self.seq_length, 7, 2)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class EsmFoldModelTest(ModelTesterMixin, unittest.TestCase): - test_mismatched_shapes = False - - all_model_classes = (EsmForProteinFolding,) if is_mindspore_available() else () - all_generative_model_classes = () - pipeline_model_mapping = {} if is_mindspore_available() else {} - test_sequence_classification_problem_types = False - - def setUp(self): - self.model_tester = EsmFoldModelTester(self) - self.config_tester = ConfigTester(self, config_class=EsmConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip("Does not support attention outputs") - def test_attention_outputs(self): - pass - - @unittest.skip - def test_correct_missing_keys(self): - pass - - @unittest.skip - def test_determinism(self): - pass - - @unittest.skip("Esm does not support embedding resizing") - def test_resize_embeddings_untied(self): - pass - - @unittest.skip("Esm does not support embedding resizing") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip("ESMFold does not support passing input embeds!") - def test_inputs_embeds(self): - pass - - @unittest.skip("ESMFold does not support head pruning.") - def test_head_pruning(self): - pass - - @unittest.skip("ESMFold does not support head pruning.") - def test_head_pruning_integration(self): - pass - - @unittest.skip("ESMFold does not support head pruning.") - def test_head_pruning_save_load_from_config_init(self): - pass - - @unittest.skip("ESMFold does not support head pruning.") - def test_head_pruning_save_load_from_pretrained(self): - pass - - @unittest.skip("ESMFold does not support head pruning.") - def test_headmasking(self): - pass - - @unittest.skip("ESMFold does not output hidden states in the normal way.") - def test_hidden_states_output(self): - pass - - @unittest.skip("ESMfold does not output hidden states in the normal way.") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip("ESMFold only has one output format.") - def test_model_outputs_equivalence(self): - pass - - @unittest.skip("This test doesn't work for ESMFold and doesn't test core functionality") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip("ESMFold does not support input chunking.") - def test_feed_forward_chunking(self): - pass - - @unittest.skip("ESMFold doesn't 
respect you and it certainly doesn't respect your initialization arguments.") - def test_initialization(self): - pass - - @unittest.skip("ESMFold doesn't support torchscript compilation.") - def test_torchscript_output_attentions(self): - pass - - @unittest.skip("ESMFold doesn't support torchscript compilation.") - def test_torchscript_output_hidden_state(self): - pass - - @unittest.skip("ESMFold doesn't support torchscript compilation.") - def test_torchscript_simple(self): - pass - - @unittest.skip("ESMFold doesn't support data parallel.") - def test_multi_gpu_data_parallel_forward(self): - pass - - -@require_mindspore -class EsmModelIntegrationTest(TestCasePlus): - @slow - def test_inference_protein_folding(self): - model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1").float() - model.set_train(False) - input_ids = mindspore.tensor([[0, 6, 4, 13, 5, 4, 16, 12, 11, 7, 2]]) - position_outputs = model(input_ids)["positions"] - expected_slice = mindspore.tensor([2.5828, 0.7993, -10.9334], dtype=mindspore.float32) - print(position_outputs[0, 0, 0, 0]) - self.assertTrue(np.allclose(position_outputs[0, 0, 0, 0].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/falcon/__init__.py b/tests/transformers/models/falcon/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/falcon/test_modeling_falcon.py b/tests/transformers/models/falcon/test_modeling_falcon.py deleted file mode 100644 index 12e71f594..000000000 --- a/tests/transformers/models/falcon/test_modeling_falcon.py +++ /dev/null @@ -1,609 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" Testing suite for the MindSpore Falcon model. 
""" - -# pylint: disable=W0613 - -import unittest -import numpy as np - -from parameterized import parameterized - -from mindspore import set_seed -from mindnlp.transformers import AutoTokenizer, AutoModelForCausalLM -from mindnlp.utils.testing_utils import is_mindspore_available, slow, require_mindspore - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers.models.falcon import ( - FalconConfig, - FalconForCausalLM, - FalconForQuestionAnswering, - FalconForSequenceClassification, - FalconForTokenClassification, - FalconModel, - ) - -class FalconModelTester: - def __init__( - self, - parent, - batch_size=3, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return FalconConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=1, - 
new_decoder_architecture=True, - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = FalconModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = FalconModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = FalconForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = FalconForCausalLM(config=config) - - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[ - :, -3:, random_slice_idx] - # :, -3:, random_slice_idx].detach() - # output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - 
self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue( - np.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - # FalconModel, - FalconForCausalLM, - FalconForSequenceClassification, - FalconForTokenClassification, - FalconForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (FalconForCausalLM,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": FalconModel, - "question-answering": FalconForQuestionAnswering, - "text-classification": FalconForSequenceClassification, - "text-generation": FalconForCausalLM, - "token-classification": FalconForTokenClassification, - "zero-shot": FalconForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - - def is_pipeline_test_to_skip( - self, - pipeline_test_casse_name, - config_class, - model_architecture, - tokenizer_name, - processor_name, - ): - return True - - def setUp(self): - self.model_tester = FalconModelTester(self) - self.config_tester = ConfigTester( - self, config_class=FalconConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_position_embedding_types(self): - config, *inputs = self.model_tester.prepare_config_and_inputs() - for alibi in [True, False]: - config.alibi = alibi - self.model_tester.create_and_check_model(config, *inputs) - - def test_falcon_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size], self.model_tester.type_sequence_label_size - ) - model = FalconForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.num_labels), - ) - - def test_falcon_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size], self.model_tester.type_sequence_label_size - ) - model = FalconForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.num_labels), - ) - - def 
test_falcon_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], - self.model_tester.type_sequence_label_size, - ).astype(mindspore.float32) - model = FalconForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.num_labels), - ) - - def test_past_key_values_format(self): - # Falcon can have different numbers of KV-heads than the number of query heads, so we need - # to override this test to use the right head counts. - for model_class in self.all_generative_model_classes: - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - # If it doesn't support cache, pass the test - if not hasattr(config, "use_cache"): - return - - model = model_class(config) - if "use_cache" not in inputs: - inputs["use_cache"] = True - outputs = model(**inputs) - - # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) - if "past_key_values" not in outputs: - return - - num_hidden_layers = ( - getattr(config, "decoder_layers", None) - or getattr(config, "num_decoder_layers", None) - or config.num_hidden_layers - ) - num_attention_heads = getattr( - config, "num_kv_heads", config.num_attention_heads - ) - embed_dim = getattr(config, "d_model", config.hidden_size) - per_head_embed_dim = embed_dim // num_attention_heads - - past_kv = outputs["past_key_values"] - self.assertEqual(len(past_kv), num_hidden_layers) - - batch_size, seq_length = inputs["input_ids"].shape - for i in range(num_hidden_layers): - if config.new_decoder_architecture: - num_attention_heads = config.num_attention_heads - elif config.multi_query: - num_attention_heads = 1 - self.assertEqual(len(past_kv[0]), 2) # K V for the decoder = 2 - self.assertEqual( - past_kv[i][0].shape, - (batch_size, num_attention_heads, seq_length, per_head_embed_dim), - ) - self.assertEqual( - past_kv[i][1].shape, - (batch_size, num_attention_heads, seq_length, per_head_embed_dim), - ) - - @parameterized.expand([("linear",), ("dynamic",)]) - @unittest.skip("need to update the test case") - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor( - [1, int(config.max_position_embeddings * 1.5)], config.vocab_size - ) - - set_seed( - 42 - ) # Fixed seed at init time so the two models get the same random weights - original_model = FalconModel(config) - original_model.set_train(False) - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed( - 42 - ) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = FalconModel(config) - scaled_model.set_train(False) - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer 
than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - self.assertTrue( - np.allclose( - original_short_output.asnumpy(), - scaled_short_output.asnumpy(), - atol=1e-5, - ) - ) - else: - self.assertFalse( - np.allclose( - original_short_output.asnumpy(), - scaled_short_output.asnumpy(), - atol=1e-5, - ) - ) - - # The output should be different for long inputs - self.assertFalse( - np.allclose( - original_long_output.asnumpy(), scaled_long_output.asnumpy(), atol=1e-5 - ) - ) - - -@require_mindspore -class FalconLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_falcon(self): - tokenizer = AutoTokenizer.from_pretrained("Rocketknight1/falcon-rw-1b") - model = FalconForCausalLM.from_pretrained("Rocketknight1/falcon-rw-1b") - model.set_train(False) - inputs = tokenizer("My favorite food is", return_tensors="ms") - - EXPECTED_OUTPUT = "My favorite food is pizza. I love it so much that I have a pizza party every year for my birthday." - - output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=19) - output_str = tokenizer.batch_decode(output_ids)[0] - - self.assertEqual(output_str, EXPECTED_OUTPUT) - - @slow - def test_lm_generation_big_models(self): - # The big models are way too big for the CI, so we use tiny random models that resemble their - # architectures but with much smaller and fewer layers - for repo in [ - "Rocketknight1/tiny-random-falcon-7b", - "Rocketknight1/tiny-random-falcon-40b", - ]: - tokenizer = AutoTokenizer.from_pretrained(repo) - model = FalconForCausalLM.from_pretrained(repo) - model.set_train(False) - - inputs = tokenizer("My favorite food is", return_tensors="ms") - - # We just test that these run without errors - the models are randomly initialized - # and so the actual text outputs will be garbage - model.generate(**inputs, do_sample=False, max_new_tokens=4) - model.generate(**inputs, do_sample=True, max_new_tokens=4) - model.generate(**inputs, num_beams=2, max_new_tokens=4) - - @slow - def test_lm_generation_use_cache(self): - # The big models are way too big for the CI, so we use tiny random models that resemble their - # architectures but with much smaller and fewer layers - for repo in [ - "Rocketknight1/falcon-rw-1b", - "Rocketknight1/tiny-random-falcon-7b", - "Rocketknight1/tiny-random-falcon-40b", - ]: - tokenizer = AutoTokenizer.from_pretrained(repo) - model = FalconForCausalLM.from_pretrained(repo) - model.set_train(False) - inputs = tokenizer("My favorite food is", return_tensors="ms") - - # Test results are the same with and without cache - outputs_no_cache = model.generate( - **inputs, do_sample=False, max_new_tokens=20, use_cache=False - ) - outputs_cache = model.generate( - **inputs, do_sample=False, max_new_tokens=20, use_cache=True - ) - self.assertTrue((outputs_cache - outputs_no_cache).sum().item() == 0) - - @slow - def test_batched_generation(self): - tokenizer = AutoTokenizer.from_pretrained( - "tiiuae/falcon-7b", padding_side="left" - ) - tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForCausalLM.from_pretrained( - "tiiuae/falcon-7b", - ) - - test_text = "A sequence: 1, 2" # should generate the rest of the sequence - - unpadded_inputs = tokenizer([test_text], return_tensors="ms") - unpadded_gen_out = model.generate(**unpadded_inputs, max_new_tokens=20) - unpadded_gen_text = tokenizer.batch_decode( - unpadded_gen_out, skip_special_tokens=True - ) - - dummy_text = "This is a longer text " * 2 # forces left-padding on 
`test_text` - padded_inputs = tokenizer( - [test_text, dummy_text], return_tensors="ms", padding=True - ) - padded_gen_out = model.generate(**padded_inputs, max_new_tokens=20) - padded_gen_text = tokenizer.batch_decode( - padded_gen_out, skip_special_tokens=True - ) - - expected_output = "A sequence: 1, 2, 3, 4, 5, 6, 7, 8, " - self.assertLess( - unpadded_inputs.input_ids.shape[-1], padded_inputs.input_ids.shape[-1] - ) # left-padding exists - self.assertEqual(unpadded_gen_text[0], expected_output) - self.assertEqual(padded_gen_text[0], expected_output) diff --git a/tests/transformers/models/fastspeech2_conformer/__init__.py b/tests/transformers/models/fastspeech2_conformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/transformers/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py deleted file mode 100644 index eb49b466c..000000000 --- a/tests/transformers/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py +++ /dev/null @@ -1,781 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore FastSpeech2Conformer model.""" -# pylint: disable=line-too-long - -import inspect -import tempfile -import unittest - -import numpy as np -from mindspore import ops -from mindnlp.transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerHifiGanConfig, - FastSpeech2ConformerTokenizer, - FastSpeech2ConformerWithHifiGanConfig, -) -from mindnlp.utils.testing_utils import require_g2p_en, is_mindspore_available, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import FastSpeech2ConformerModel, FastSpeech2ConformerWithHifiGan - from mindspore import set_seed - -class FastSpeech2ConformerModelTester: - def __init__( - self, - parent, - batch_size=13, - num_hidden_layers=1, - num_attention_heads=2, - hidden_size=24, - seq_length=7, - encoder_linear_units=384, - decoder_linear_units=384, - is_training=False, - speech_decoder_postnet_units=128, - speech_decoder_postnet_layers=2, - pitch_predictor_layers=1, - energy_predictor_layers=1, - duration_predictor_layers=1, - num_mel_bins=8, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = hidden_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.encoder_linear_units = encoder_linear_units - self.decoder_linear_units = decoder_linear_units - self.speech_decoder_postnet_units = speech_decoder_postnet_units - self.speech_decoder_postnet_layers = speech_decoder_postnet_layers - self.pitch_predictor_layers = 
pitch_predictor_layers - self.energy_predictor_layers = energy_predictor_layers - self.duration_predictor_layers = duration_predictor_layers - self.num_mel_bins = num_mel_bins - - def prepare_config_and_inputs(self): - config = self.get_config() - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - return config, input_ids - - def get_config(self): - return FastSpeech2ConformerConfig( - hidden_size=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_linear_units=self.encoder_linear_units, - decoder_linear_units=self.decoder_linear_units, - speech_decoder_postnet_units=self.speech_decoder_postnet_units, - speech_decoder_postnet_layers=self.speech_decoder_postnet_layers, - num_mel_bins=self.num_mel_bins, - pitch_predictor_layers=self.pitch_predictor_layers, - energy_predictor_layers=self.energy_predictor_layers, - duration_predictor_layers=self.duration_predictor_layers, - ) - - def create_and_check_model(self, config, input_ids, *args): - model = FastSpeech2ConformerModel(config=config) - model.set_train(False) - result = model(input_ids, return_dict=True) - - # total of 5 keys in result - self.parent.assertEqual(len(result), 5) - # check batch sizes match - for value in result.values(): - self.parent.assertEqual(value.shape[0], self.batch_size) - # check duration, pitch, and energy have the appopriate shapes - # duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1) - self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape) - self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape) - # check predicted mel-spectrogram has correct dimension - self.parent.assertEqual(result["spectrogram"].shape[2], model.config.num_mel_bins) - - def prepare_config_and_inputs_for_common(self): - config, input_ids = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids} - return config, inputs_dict - - -@require_mindspore -class FastSpeech2ConformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FastSpeech2ConformerModel,) if is_mindspore_available() else () - test_pruning = False - test_headmasking = False - test_torchscript = False - test_resize_embeddings = False - is_encoder_decoder = True - - def setUp(self): - self.model_tester = FastSpeech2ConformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=FastSpeech2ConformerConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): # pylint: disable=forgotten-debug-statement - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - msg = f"Parameter {name} of model {model_class} seems not properly initialized" - if "norm" in name: - if "bias" in name: - self.assertEqual(param.data.mean().item(), 0.0, msg=msg) - if "weight" in name: - self.assertEqual(param.data.mean().item(), 1.0, msg=msg) - elif "conv" in name or "embed" in name: - self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg) - - def test_duration_energy_pitch_output(self): - config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = self.model_tester.seq_length - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # duration - self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len]) - # energy - self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) - # pitch - self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) - - def test_hidden_states_output(self): - def _check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]): - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - - self.assertEqual(len(hidden_states), expected_num_layers) - self.assertIsInstance(hidden_states, (list, tuple)) - expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape - self.assertEqual(expected_batch_size, self.model_tester.batch_size) - # Only test encoder seq_length since decoder seq_length is variable based on inputs - if idx == 0: - self.assertEqual(expected_seq_length, self.model_tester.seq_length) - self.assertEqual(expected_hidden_size, self.model_tester.hidden_size) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - inputs_dict["output_hidden_states"] = True - _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerModel) - - def test_save_load_strict(self): - config, _ = self.model_tester.prepare_config_and_inputs() - model = FastSpeech2ConformerModel(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - _, info = FastSpeech2ConformerModel.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = FastSpeech2ConformerModel(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_ids", - "attention_mask", - "spectrogram_labels", - "duration_labels", - "pitch_labels", - "energy_labels", - "speaker_ids", - "lang_ids", - "speaker_embedding", - "return_dict", - "output_attentions", - "output_hidden_states", - ] - self.assertListEqual(arg_names, expected_arg_names) - - # Override as FastSpeech2Conformer does not output cross attentions - @unittest.skip(reason="MindSpore has no retain_grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - model = FastSpeech2ConformerModel(config) - model.set_train(False) - - inputs = self._prepare_for_class(inputs_dict, 
FastSpeech2ConformerModel) - - outputs = model(**inputs) - - output = outputs[0] - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_hidden_states.retain_grad() - - decoder_hidden_states = outputs.decoder_hidden_states[0] - decoder_hidden_states.retain_grad() - - encoder_attentions = outputs.encoder_attentions[0] - encoder_attentions.retain_grad() - - decoder_attentions = outputs.decoder_attentions[0] - decoder_attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(decoder_hidden_states.grad) - self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) - - def test_attention_outputs(self): - """ - Custom `test_attention_outputs` since FastSpeech2Conformer does not output cross attentions, has variable - decoder attention shape, and uniquely outputs energy, pitch, and durations. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = self.model_tester.seq_length - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - encoder_attentions = outputs.encoder_attentions - self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(encoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - correct_outlen = 7 - self.assertEqual(out_len, correct_outlen) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - @slow - def test_model_from_pretrained(self): - model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") - self.assertIsNotNone(model) - - @unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="FastSpeech2Conformer has no input embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - "FastSpeech2Conformer predicts durations in linear domain during inference" - "Even small differences on hidden states lead to different durations, due to `ops.round`" - ) - def test_batching_equivalence(self): - pass - - -@require_mindspore -@require_g2p_en -@slow -class FastSpeech2ConformerModelIntegrationTest(unittest.TestCase): - def test_inference_integration(self): - model = 
FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") - model.set_train(False) - - tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") - text = "Test that this generates speech" - input_ids = tokenizer(text, return_tensors="ms")["input_ids"] - - outputs_dict = model(input_ids) - spectrogram = outputs_dict["spectrogram"] - - # mel-spectrogram is too large (1, 205, 80), so only check top-left 100 elements - # fmt: off - expected_mel_spectrogram = mindspore.tensor( - [ - [-1.2426, -1.7286, -1.6754, -1.7451, -1.6402, -1.5219, -1.4480, -1.3345, -1.4031, -1.4497], - [-0.7858, -1.4966, -1.3602, -1.4876, -1.2949, -1.0723, -1.0021, -0.7553, -0.6521, -0.6929], - [-0.7298, -1.3908, -1.0369, -1.2656, -1.0342, -0.7883, -0.7420, -0.5249, -0.3734, -0.3977], - [-0.4784, -1.3508, -1.1558, -1.4678, -1.2820, -1.0252, -1.0868, -0.9006, -0.8947, -0.8448], - [-0.3963, -1.2895, -1.2813, -1.6147, -1.4658, -1.2560, -1.4134, -1.2650, -1.3255, -1.1715], - [-1.4914, -1.3097, -0.3821, -0.3898, -0.5748, -0.9040, -1.0755, -1.0575, -1.2205, -1.0572], - [0.0197, -0.0582, 0.9147, 1.1512, 1.1651, 0.6628, -0.1010, -0.3085, -0.2285, 0.2650], - [1.1780, 0.1803, 0.7251, 1.5728, 1.6678, 0.4542, -0.1572, -0.1787, 0.0744, 0.8168], - [-0.2078, -0.3211, 1.1096, 1.5085, 1.4632, 0.6299, -0.0515, 0.0589, 0.8609, 1.4429], - [0.7831, -0.2663, 1.0352, 1.4489, 0.9088, 0.0247, -0.3995, 0.0078, 1.2446, 1.6998], - ], - ) - # fmt: on - - self.assertTrue(np.allclose(spectrogram[0, :10, :10].numpy(), expected_mel_spectrogram.numpy(), atol=1e-4)) - self.assertEqual(spectrogram.shape, (1, 205, model.config.num_mel_bins)) - - @unittest.skip("Test input is randomized") - def test_training_integration(self): - model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") - # Set self.training manually to keep deterministic but run the training path - model.training = True - set_seed(0) - - tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") - text = "Test that this generates speech" - input_ids = tokenizer(text, return_tensors="ms")["input_ids"] - - # NOTE: Dummy numbers since FastSpeech2Conformer does not have a feature extractor due to the package deps required (librosa, MFA) - # 输入的一部分是随机生成的,无法保证与 transformers 生成得一致 - batch_size, max_text_len = input_ids.shape - pitch_labels = ops.rand((batch_size, max_text_len, 1), dtype=mindspore.float32) - energy_labels = ops.rand((batch_size, max_text_len, 1), dtype=mindspore.float32) - duration_labels = ops.normal((batch_size, max_text_len), 10, 2).clamp(1, 20).int() - max_target_len = duration_labels.sum(axis=1).max(axis=0) - max_target_len = max_target_len.item() - spectrogram_labels = ops.rand( - (batch_size, max_target_len, model.num_mel_bins), dtype=mindspore.float32 - ) - - outputs_dict = model( - input_ids, - spectrogram_labels=spectrogram_labels, - duration_labels=duration_labels, - pitch_labels=pitch_labels, - energy_labels=energy_labels, - return_dict=True, - ) - spectrogram = outputs_dict["spectrogram"] - loss = outputs_dict["loss"] - - # # mel-spectrogram is too large (1, 224, 80), so only check top-left 100 elements - # fmt: off - expected_mel_spectrogram = mindspore.tensor( - [ - [-1.0643e+00, -6.8058e-01, -1.0901e+00, -8.2724e-01, -7.7241e-01, -1.1905e+00, -8.5725e-01, -8.2930e-01, -1.1313e+00, -1.2449e+00], - [-5.5067e-01, -2.7045e-01, -6.3483e-01, -1.9320e-01, 1.0234e-01, -3.3253e-01, -2.4423e-01, -3.5045e-01, -5.2070e-01, -4.3710e-01], - [ 2.2181e-01, 3.1433e-01, -1.2849e-01, 
6.0253e-01, 1.0033e+00, 1.3952e-01, 1.2851e-01, -2.3063e-02, -1.5092e-01, 2.4903e-01], - [ 4.6343e-01, 4.1820e-01, 1.6468e-01, 1.1297e+00, 1.4588e+00, 1.3737e-01, 6.6355e-02, -6.0973e-02, -5.4225e-02, 5.9208e-01], - [ 5.2762e-01, 4.8725e-01, 4.2735e-01, 1.4392e+00, 1.7398e+00, 2.4891e-01, -8.4531e-03, -8.1282e-02, 1.2857e-01, 8.7559e-01], - [ 5.2548e-01, 5.1653e-01, 5.2034e-01, 1.3782e+00, 1.5972e+00, 1.6380e-01, -5.1807e-02, 1.5474e-03, 2.2824e-01, 8.5288e-01], - [ 3.6356e-01, 4.4109e-01, 4.4257e-01, 9.4273e-01, 1.1201e+00, -9.0551e-03, -1.1627e-01, -2.0821e-02, 1.0793e-01, 5.0336e-01], - [ 3.6598e-01, 3.2708e-01, 1.3297e-01, 4.5162e-01, 6.4168e-01, -2.6923e-01, -2.3101e-01, -1.4943e-01, -1.4732e-01, 7.3057e-02], - [ 2.7639e-01, 2.2588e-01, -1.5310e-01, 1.0957e-01, 3.3048e-01, -5.3431e-01, -3.3822e-01, -2.8007e-01, -3.3823e-01, -1.5775e-01], - [ 2.9323e-01, 1.6723e-01, -3.4153e-01, -1.1209e-01, 1.7355e-01, -6.1724e-01, -5.4201e-01, -4.9944e-01, -5.2212e-01, -2.7596e-01] - ], - ) - # fmt: on - - expected_loss = mindspore.tensor(74.4595) - - self.assertTrue(np.allclose(spectrogram[0, :10, :10].numpy(), expected_mel_spectrogram.numpy(), atol=1e-3)) - self.assertTrue(np.allclose(loss.numpy(), expected_loss.numpy(), atol=1e-4)) - self.assertEqual(spectrogram.shape, (1, 224, model.config.num_mel_bins)) - - -class FastSpeech2ConformerWithHifiGanTester: - def __init__( - self, - parent, - batch_size=13, - num_hidden_layers=1, - num_attention_heads=2, - hidden_size=24, - seq_length=7, - encoder_linear_units=384, - decoder_linear_units=384, - is_training=False, - speech_decoder_postnet_units=128, - speech_decoder_postnet_layers=2, - pitch_predictor_layers=1, - energy_predictor_layers=1, - duration_predictor_layers=1, - num_mel_bins=8, - upsample_initial_channel=64, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = hidden_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.encoder_linear_units = encoder_linear_units - self.decoder_linear_units = decoder_linear_units - self.speech_decoder_postnet_units = speech_decoder_postnet_units - self.speech_decoder_postnet_layers = speech_decoder_postnet_layers - self.pitch_predictor_layers = pitch_predictor_layers - self.energy_predictor_layers = energy_predictor_layers - self.duration_predictor_layers = duration_predictor_layers - self.num_mel_bins = num_mel_bins - self.upsample_initial_channel = upsample_initial_channel - - def prepare_config_and_inputs(self): - config = self.get_config() - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - return config, input_ids - - def get_config(self): - self.model_config = FastSpeech2ConformerConfig( - hidden_size=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_linear_units=self.encoder_linear_units, - decoder_linear_units=self.decoder_linear_units, - speech_decoder_postnet_units=self.speech_decoder_postnet_units, - speech_decoder_postnet_layers=self.speech_decoder_postnet_layers, - num_mel_bins=self.num_mel_bins, - pitch_predictor_layers=self.pitch_predictor_layers, - energy_predictor_layers=self.energy_predictor_layers, - duration_predictor_layers=self.duration_predictor_layers, - ) - self.vocoder_config = FastSpeech2ConformerHifiGanConfig( - model_in_dim=self.num_mel_bins, upsample_initial_channel=self.upsample_initial_channel - ) - return 
FastSpeech2ConformerWithHifiGanConfig( - model_config=self.model_config.to_dict(), vocoder_config=self.vocoder_config.to_dict() - ) - - def create_and_check_model(self, config, input_ids, *args): - model = FastSpeech2ConformerWithHifiGan(config=config) - model.set_train(False) - result = model(input_ids, return_dict=True) - - # total of 5 keys in result - self.parent.assertEqual(len(result), 6) - # check batch sizes match - for value in result.values(): - self.parent.assertEqual(value.shape[0], self.batch_size) - # check duration, pitch, and energy have the appopriate shapes - # duration: (batch_size, max_text_length), pitch and energy: (batch_size, max_text_length, 1) - self.parent.assertEqual(result["duration_outputs"].shape + (1,), result["pitch_outputs"].shape) - self.parent.assertEqual(result["pitch_outputs"].shape, result["energy_outputs"].shape) - # check predicted mel-spectrogram has correct dimension - self.parent.assertEqual(result["spectrogram"].shape[2], model.config.model_config.num_mel_bins) - - def prepare_config_and_inputs_for_common(self): - config, input_ids = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids} - return config, inputs_dict - - -@require_mindspore -class FastSpeech2ConformerWithHifiGanTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FastSpeech2ConformerWithHifiGan,) if is_mindspore_available() else () - test_pruning = False - test_headmasking = False - test_torchscript = False - test_resize_embeddings = False - is_encoder_decoder = True - - def setUp(self): - self.model_tester = FastSpeech2ConformerWithHifiGanTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - msg = f"Parameter {name} of model {model_class} seems not properly initialized" - if "norm" in name: - if "bias" in name: - self.assertEqual(param.data.mean().item(), 0.0, msg=msg) - if "weight" in name: - self.assertEqual(param.data.mean().item(), 1.0, msg=msg) - elif "conv" in name or "embed" in name: - self.assertTrue(-1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, msg=msg) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - return inputs_dict - - def test_duration_energy_pitch_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.model_config.return_dict = True - - seq_len = self.model_tester.seq_length - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # duration - self.assertListEqual(list(outputs.duration_outputs.shape), [self.model_tester.batch_size, seq_len]) - # energy - self.assertListEqual(list(outputs.energy_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) - # pitch - self.assertListEqual(list(outputs.pitch_outputs.shape), [self.model_tester.batch_size, seq_len, 1]) - - def test_hidden_states_output(self): - def _check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, 
model_class)) - - for idx, hidden_states in enumerate([outputs.encoder_hidden_states, outputs.decoder_hidden_states]): - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - - self.assertEqual(len(hidden_states), expected_num_layers) - self.assertIsInstance(hidden_states, (list, tuple)) - expected_batch_size, expected_seq_length, expected_hidden_size = hidden_states[0].shape - self.assertEqual(expected_batch_size, self.model_tester.batch_size) - # Only test encoder seq_length since decoder seq_length is variable based on inputs - if idx == 0: - self.assertEqual(expected_seq_length, self.model_tester.seq_length) - self.assertEqual(expected_hidden_size, self.model_tester.hidden_size) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - inputs_dict["output_hidden_states"] = True - _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.model_config.output_hidden_states = True - - _check_hidden_states_output(inputs_dict, config, FastSpeech2ConformerWithHifiGan) - - def test_save_load_strict(self): - config, _ = self.model_tester.prepare_config_and_inputs() - model = FastSpeech2ConformerWithHifiGan(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - _, info = FastSpeech2ConformerWithHifiGan.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = FastSpeech2ConformerWithHifiGan(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_ids", - "attention_mask", - "spectrogram_labels", - "duration_labels", - "pitch_labels", - "energy_labels", - "speaker_ids", - "lang_ids", - "speaker_embedding", - "return_dict", - "output_attentions", - "output_hidden_states", - ] - self.assertListEqual(arg_names, expected_arg_names) - - # Override as FastSpeech2Conformer does not output cross attentions - @unittest.skip(reason="MindSpore has no retain_grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.model_config.output_hidden_states = True - config.model_config.output_attentions = True - - model = FastSpeech2ConformerWithHifiGan(config) - model.set_train(False) - - inputs = self._prepare_for_class(inputs_dict, FastSpeech2ConformerModel) - - outputs = model(**inputs) - - output = outputs[0] - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_hidden_states.retain_grad() - - decoder_hidden_states = outputs.decoder_hidden_states[0] - decoder_hidden_states.retain_grad() - - encoder_attentions = outputs.encoder_attentions[0] - encoder_attentions.retain_grad() - - decoder_attentions = outputs.decoder_attentions[0] - decoder_attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(decoder_hidden_states.grad) - self.assertIsNotNone(encoder_attentions.grad) - self.assertIsNotNone(decoder_attentions.grad) - - def test_attention_outputs(self): - """ - Custom `test_attention_outputs` since 
FastSpeech2Conformer does not output cross attentions, has variable - decoder attention shape, and uniquely outputs energy, pitch, and durations. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.model_config.return_dict = True - - seq_len = self.model_tester.seq_length - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.model_config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(len(outputs.encoder_attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.model_config.output_attentions = True - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - encoder_attentions = outputs.encoder_attentions - self.assertEqual(len(encoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(encoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - correct_outlen = 8 - self.assertEqual(out_len, correct_outlen) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - @slow - def test_model_from_pretrained(self): - model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") - self.assertIsNotNone(model) - - @unittest.skip(reason="FastSpeech2Conformer does not accept inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="FastSpeech2Conformer has no input embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - "FastSpeech2Conformer predicts durations in linear domain during inference" - "Even small differences on hidden states lead to different durations, due to `ops.round`" - ) - def test_batching_equivalence(self): - pass - - -@require_mindspore -@require_g2p_en -@slow -class FastSpeech2ConformerWithHifiGanIntegrationTest(unittest.TestCase): - def test_inference_integration(self): - model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan") - model.set_train(False) - - tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") - text = "Test that this generates speech" - input_ids = tokenizer(text, return_tensors="ms")["input_ids"] - - output = model(input_ids) - waveform = output.waveform - - # waveform is too large (1, 52480), so only check first 100 elements - # fmt: off - expected_waveform = mindspore.tensor( - [ - [-9.6345e-04, 1.3557e-03, 5.7559e-04, 2.4706e-04, 2.2675e-04, 1.2258e-04, 4.7784e-04, 1.0109e-03, -1.9718e-04, 6.3495e-04, 3.2106e-04, 6.3620e-05, 9.1713e-04, -2.5664e-05, 1.9596e-04, 6.0418e-04, 8.1112e-04, 3.6342e-04, -6.3396e-04, -2.0146e-04, -1.1768e-04, 4.3155e-04, 
7.5599e-04, -2.2972e-04, -9.5665e-05, 3.3078e-04, 1.3793e-04, -1.4932e-04, -3.9645e-04, 3.6473e-05, -1.7224e-04, -4.5370e-05, -4.8950e-04, -4.3059e-04, 1.0451e-04, -1.0485e-03, -6.0410e-04, 1.6990e-04, -2.1997e-04, -3.8769e-04, -7.6898e-04, -3.2372e-04, -1.9783e-04, 5.2896e-05, -1.0586e-03, -7.8516e-04, 7.6867e-04, -8.5331e-05, -4.8158e-04, -4.5362e-05, -1.0770e-04, 6.6823e-04, 3.0765e-04, 3.3669e-04, 9.5677e-04, 1.0458e-03, 5.8129e-04, 3.3737e-04, 1.0816e-03, 7.0346e-04, 4.2378e-04, 4.3131e-04, 2.8095e-04, 1.2201e-03, 5.6121e-04, -1.1086e-04, 4.9908e-04, 1.5586e-04, 4.2046e-04, -2.8088e-04, -2.2462e-04, -1.5539e-04, -7.0126e-04, -2.8577e-04, -3.3693e-04, -1.2471e-04, -6.9104e-04, -1.2867e-03, -6.2651e-04, -2.5586e-04, -1.3201e-04, -9.4537e-04, -4.8438e-04, 4.1458e-04, 6.4109e-04, 1.0891e-04, -6.3764e-04, 4.5573e-04, 8.2974e-04, 3.2973e-06, -3.8274e-04, -2.0400e-04, 4.9922e-04, 2.1508e-04, -1.1009e-04, -3.9763e-05, 3.0576e-04, 3.1485e-05, -2.7574e-05, 3.3856e-04], - ], - ) - # fmt: on - - self.assertTrue(np.allclose(waveform[0, :100].numpy(), expected_waveform.numpy(), atol=1e-4)) - self.assertEqual(waveform.shape, (1, 52480)) diff --git a/tests/transformers/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/tests/transformers/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py deleted file mode 100644 index 61ffe9f71..000000000 --- a/tests/transformers/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py +++ /dev/null @@ -1,196 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tests for the FastSpeech2Conformer tokenizer.""" -# pylint: disable=line-too-long - -import unittest - -from mindnlp.transformers import FastSpeech2ConformerTokenizer -from mindnlp.utils.testing_utils import require_g2p_en, slow - -from ...test_tokenization_common import TokenizerTesterMixin - - -@require_g2p_en -class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "espnet/fastspeech2_conformer" - tokenizer_class = FastSpeech2ConformerTokenizer - test_rust_tokenizer = False - - def setUp(self): - super().setUp() - tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") - tokenizer.save_pretrained(self.tmpdirname) - - def get_input_output_texts(self, tokenizer): - input_text = "this is a test" - output_text = "this is a test" - return input_text, output_text - - # Custom `get_clean_sequence` since FastSpeech2ConformerTokenizer can't decode id -> string - def get_clean_sequence(self, tokenizer, with_prefix_space=False, **kwargs): # max_length=20, min_length=5 - input_text, output_text = self.get_input_output_texts(tokenizer) - ids = tokenizer.encode(output_text, add_special_tokens=False) - return output_text, ids - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 1 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - vocab_keys = list(self.get_tokenizer().get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-4], "UH0") - self.assertEqual(vocab_keys[-2], "..") - self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), 78) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 78) - - @unittest.skip( - "FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_added_token_are_matched_longest_first(self): - pass - - @unittest.skip( - "FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_added_tokens_do_lower_case(self): - pass - - @unittest.skip( - "FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_tokenize_special_tokens(self): - pass - - def test_full_tokenizer(self): - tokenizer = self.get_tokenizer() - - tokens = tokenizer.tokenize("This is a test") - ids = [9, 12, 6, 12, 11, 2, 4, 15, 6, 4, 77] - self.assertListEqual(tokens, ["DH", "IH1", "S", "IH1", "Z", "AH0", "T", "EH1", "S", "T", ""]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), ids) - self.assertListEqual(tokenizer.convert_ids_to_tokens(ids), tokens) - - @slow - def test_tokenizer_integration(self): - # Custom test since: - # 1) This tokenizer only decodes to tokens (phonemes cannot be converted to text with complete accuracy) - # 2) Uses a sequence without numbers since espnet has different, custom number conversion. - # This tokenizer can phonemize numbers, but where in espnet "32" is phonemized as "thirty two", - # here "32" is phonemized as "thirty-two" because we haven't implemented the custom number handling. 
- - sequences = [ - "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " - "general-purpose architectures (BERT, GPT, RoBERTa, XLM, DistilBert, XLNet...) for Natural " - "Language Understanding (NLU) and Natural Language Generation (NLG) with over thirty-two pretrained " - "models in one hundred plus languages and deep interoperability between Jax, PyTorch and TensorFlow.", - "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " - "conditioning on both left and right context in all layers.", - "The quick brown fox jumps over the lazy dog.", - ] - tokenizer = FastSpeech2ConformerTokenizer.from_pretrained( - "espnet/fastspeech2_conformer", revision="07f9c4a2d6bbc69b277d87d2202ad1e35b05e113" - ) - actual_encoding = tokenizer(sequences) - - # fmt: off - expected_encoding = { - 'input_ids': [ - [4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 22, 30, 7, 14, 21, 8, 29, 3, 34, 3, 18, 11, 17, 12, 4, 21, 10, 4, 7, 60, 3, 6, 22, 30, 7, 14, 21, 11, 2, 3, 5, 17, 12, 4, 21, 10, 17, 7, 29, 4, 7, 31, 3, 5, 25, 38, 4, 17, 7, 2, 20, 32, 5, 11, 40, 15, 3, 21, 2, 8, 17, 38, 17, 2, 6, 24, 7, 10, 2, 4, 45, 10, 39, 21, 11, 25, 38, 4, 23, 37, 15, 4, 6, 23, 7, 2, 25, 38, 4, 2, 23, 11, 8, 15, 14, 11, 23, 5, 13, 6, 4, 12, 8, 4, 21, 25, 23, 11, 8, 15, 3, 39, 2, 8, 1, 22, 30, 7, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 62, 3, 5, 21, 6, 4, 18, 3, 5, 13, 36, 3, 8, 28, 2, 3, 5, 3, 18, 39, 21, 2, 8, 8, 18, 36, 37, 16, 2, 40, 40, 45, 3, 21, 31, 35, 2, 3, 15, 8, 36, 16, 12, 9, 34, 20, 21, 43, 38, 5, 29, 4, 28, 17, 7, 29, 4, 7, 31, 3, 5, 14, 24, 5, 2, 8, 11, 13, 3, 16, 19, 3, 26, 19, 3, 5, 7, 2, 5, 17, 8, 19, 6, 8, 18, 36, 37, 16, 2, 40, 2, 11, 2, 3, 5, 5, 27, 17, 49, 3, 4, 21, 2, 17, 21, 25, 12, 8, 2, 4, 29, 25, 13, 4, 16, 27, 3, 40, 18, 10, 6, 23, 17, 12, 4, 21, 10, 2, 3, 5, 4, 15, 3, 6, 21, 8, 46, 22, 33, 77], - [25, 38, 4, 12, 11, 5, 13, 11, 32, 3, 5, 4, 28, 17, 7, 27, 4, 7, 31, 3, 5, 27, 17, 25, 51, 5, 13, 7, 15, 10, 35, 2, 3, 2, 8, 7, 45, 17, 7, 2, 11, 2, 3, 4, 31, 35, 2, 3, 11, 22, 7, 19, 14, 2, 3, 8, 31, 25, 2, 8, 5, 4, 15, 10, 6, 4, 25, 32, 40, 55, 3, 4, 8, 29, 10, 2, 3, 5, 12, 35, 2, 3, 13, 36, 24, 3, 25, 34, 43, 8, 15, 22, 4, 2, 3, 5, 7, 32, 4, 10, 24, 3, 4, 54, 10, 6, 4, 13, 3, 30, 8, 8, 31, 21, 11, 33, 77], - [9, 2, 10, 16, 12, 10, 25, 7, 42, 3, 22, 24, 10, 6, 40, 19, 14, 17, 6, 34, 20, 21, 9, 2, 8, 31, 11, 29, 5, 30, 37, 33, 77] - ], - 'attention_mask': [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - ] - } - # fmt: on - - actual_tokens = [tokenizer.decode(input_ids) for input_ids in expected_encoding["input_ids"]] - expected_tokens = [ - [tokenizer.convert_ids_to_tokens(id) for id in sequence] for sequence in expected_encoding["input_ids"] - ] - - self.assertListEqual(actual_encoding["input_ids"], expected_encoding["input_ids"]) - self.assertListEqual(actual_encoding["attention_mask"], expected_encoding["attention_mask"]) - self.assertTrue(actual_tokens == expected_tokens) - - @unittest.skip( - reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_add_tokens_tokenizer(self): - pass - - @unittest.skip( - reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_add_special_tokens(self): - pass - - @unittest.skip( - reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_added_token_serializable(self): - pass - - @unittest.skip( - reason="FastSpeech2Conformer tokenizer does not support adding tokens as they can't be added to the g2p_en backend" - ) - def test_save_and_load_tokenizer(self): - pass - - @unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping") - def test_internal_consistency(self): - pass - - @unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping") - def test_encode_decode_with_spaces(self): - pass - - @unittest.skip(reason="Phonemes cannot be reliably converted to string due to one-many mapping") - def test_convert_tokens_to_string_format(self): - pass - - @unittest.skip("FastSpeech2Conformer tokenizer does not support pairs.") - def test_maximum_encoding_length_pair_input(self): - pass - - @unittest.skip( - "FastSpeech2Conformer tokenizer appends eos_token to each string it's passed, including `is_split_into_words=True`." - ) - def test_pretokenized_inputs(self): - pass - - @unittest.skip( - reason="g2p_en is slow is with large inputs and max encoding length is not a concern for FastSpeech2Conformer" - ) - def test_maximum_encoding_length_single_input(self): - pass - - @unittest.skip(reason="no pretrained tokenizer for FastSpeech-Conformer model") - def test_pretrained_model_lists(self): - pass diff --git a/tests/transformers/models/flaubert/__init__.py b/tests/transformers/models/flaubert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/flaubert/test_modeling_flaubert.py b/tests/transformers/models/flaubert/test_modeling_flaubert.py deleted file mode 100644 index 6899c1c47..000000000 --- a/tests/transformers/models/flaubert/test_modeling_flaubert.py +++ /dev/null @@ -1,460 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import unittest - -from mindnlp.transformers import FlaubertConfig, is_mindspore_available -from mindnlp.utils import is_sacremoses_available -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.core import ops - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -if is_mindspore_available(): - from mindnlp.transformers import ( - FlaubertForMultipleChoice, - FlaubertForQuestionAnswering, - FlaubertForQuestionAnsweringSimple, - FlaubertForSequenceClassification, - FlaubertForTokenClassification, - FlaubertModel, - FlaubertWithLMHeadModel, - ) - from mindnlp.transformers.models.flaubert.modeling_flaubert import create_sinusoidal_embeddings - - import mindspore - -class FlaubertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_lengths=True, - use_token_type_ids=True, - use_labels=True, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=2, - vocab_size=99, - n_special=0, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=12, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - summary_type="last", - use_proj=None, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_lengths = use_input_lengths - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.vocab_size = vocab_size - self.n_special = n_special - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.summary_type = summary_type - self.use_proj = use_proj - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - input_lengths = None - if self.use_input_lengths: - input_lengths = ( - ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 - ) # small variation of seq_length - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) - - sequence_labels = None - token_labels = None - is_impossible_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - is_impossible_labels = ids_tensor([self.batch_size], 2).float() - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - 
input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ) - - def get_config(self): - return FlaubertConfig( - vocab_size=self.vocab_size, - n_special=self.n_special, - emb_dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - gelu_activation=self.gelu_activation, - sinusoidal_embeddings=self.sinusoidal_embeddings, - asm=self.asm, - causal=self.causal, - n_langs=self.n_langs, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - summary_type=self.summary_type, - use_proj=self.use_proj, - ) - - def create_and_check_flaubert_model( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = FlaubertModel(config=config) - model.set_train(False) - result = model(input_ids, lengths=input_lengths, langs=token_type_ids) - result = model(input_ids, langs=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_flaubert_lm_head( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = FlaubertWithLMHeadModel(config) - model.set_train(False) - - result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_flaubert_simple_qa( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = FlaubertForQuestionAnsweringSimple(config) - model.set_train(False) - - result = model(input_ids) - - result = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_flaubert_qa( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = FlaubertForQuestionAnswering(config) - model.set_train(False) - - result = model(input_ids) - - result_with_labels = model( - input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask, - ) - - result_with_labels = model( - input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - ) - - (total_loss,) = result_with_labels.to_tuple() - - result_with_labels = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) - - (total_loss,) = result_with_labels.to_tuple() - - self.parent.assertEqual(result_with_labels.loss.shape, ()) - self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) - self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top)) - 
self.parent.assertEqual( - result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) - ) - self.parent.assertEqual( - result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) - ) - self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) - - def create_and_check_flaubert_sequence_classif( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = FlaubertForSequenceClassification(config) - model.set_train(False) - - result = model(input_ids) - result = model(input_ids, labels=sequence_labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def create_and_check_flaubert_token_classif( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - config.num_labels = self.num_labels - model = FlaubertForTokenClassification(config) - model.set_train(False) - - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_flaubert_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - config.num_choices = self.num_choices - model = FlaubertForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "lengths": input_lengths, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class FlaubertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - FlaubertModel, - FlaubertWithLMHeadModel, - FlaubertForQuestionAnswering, - FlaubertForQuestionAnsweringSimple, - FlaubertForSequenceClassification, - FlaubertForTokenClassification, - FlaubertForMultipleChoice, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": FlaubertModel, - "fill-mask": FlaubertWithLMHeadModel, - "question-answering": FlaubertForQuestionAnsweringSimple, - "text-classification": FlaubertForSequenceClassification, - "token-classification": FlaubertForTokenClassification, - "zero-shot": FlaubertForSequenceClassification, - } - if is_mindspore_available() and is_sacremoses_available() - else {} - ) - - # Flaubert has 2 QA models -> need to manually set the correct labels 
for one of them here - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "FlaubertForQuestionAnswering": - inputs_dict["start_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - inputs_dict["end_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - - return inputs_dict - - def setUp(self): - self.model_tester = FlaubertModelTester(self) - self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_flaubert_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_model(*config_and_inputs) - - # Copied from tests/models/distilbert/test_modeling_distilbert.py with Distilbert->Flaubert - def test_flaubert_model_with_sinusoidal_encodings(self): - config = FlaubertConfig(sinusoidal_embeddings=True) - model = FlaubertModel(config=config) - sinusoidal_pos_embds = ops.zeros((config.max_position_embeddings, config.emb_dim), dtype=mindspore.float32) - create_sinusoidal_embeddings(config.max_position_embeddings, config.emb_dim, sinusoidal_pos_embds) - self.model_tester.parent.assertTrue(ops.equal(model.position_embeddings.weight, sinusoidal_pos_embds).all()) - - def test_flaubert_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs) - - def test_flaubert_simple_qa(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_simple_qa(*config_and_inputs) - - def test_flaubert_qa(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_qa(*config_and_inputs) - - def test_flaubert_sequence_classif(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs) - - def test_flaubert_token_classif(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_token_classif(*config_and_inputs) - - def test_flaubert_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_flaubert_multiple_choice(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "flaubert/flaubert_small_cased" - model = FlaubertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class FlaubertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = (1, 11, 768) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[-2.6251, -1.4298, -0.0227], [-2.8510, -1.6387, 0.2258], [-2.8114, -1.1832, -0.3066]]] - ) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/transformers/models/flaubert/test_tokenization_flaubert.py b/tests/transformers/models/flaubert/test_tokenization_flaubert.py deleted file mode 
100644 index af0916ea0..000000000 --- a/tests/transformers/models/flaubert/test_tokenization_flaubert.py +++ /dev/null @@ -1,86 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Testing suite for the FlauBERT tokenizer."""
-
-import json
-import os
-import unittest
-
-from mindnlp.transformers import FlaubertTokenizer
-from mindnlp.transformers.models.flaubert.tokenization_flaubert import VOCAB_FILES_NAMES
-from mindnlp.utils.testing_utils import slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "flaubert/flaubert_base_cased"
-    tokenizer_class = FlaubertTokenizer
-    test_rust_tokenizer = False
-
-    def setUp(self):
-        super().setUp()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "i</w>", "lo", "low", "ne", "new", "er</w>", "low</w>", "lowest</w>", "new</w>", "newer</w>", "wider</w>", "<unk>"]  # fmt: skip
-
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["n e 300", "ne w 301", "e r</w> 302", ""]
-
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @unittest.skip("not supported yet.")
-    def test_pretrained_model_lists(self):
-        # We should have at least one default checkpoint for each tokenizer
-        # We should specify the max input length as well (used in some part to list the pretrained checkpoints)
-        self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
-        self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
-        self.assertEqual(
-            len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]),
-            len(self.tokenizer_class.max_model_input_sizes),
-        )
-
-    # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer
-    def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-        text = "lower newer"
-        bpe_tokens = ["l", "o", "w", "er</w>", "new", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [0, 1, 2, 18, 17, 18, 24]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    @slow
-    # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_sequence_builders
-    def test_sequence_builders(self):
-        tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build",
add_special_tokens=False) - - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - print(encoded_sentence) - print(encoded_sentence) - - assert encoded_sentence == [0] + text + [1] - assert encoded_pair == [0] + text + [1] + text_2 + [1] diff --git a/tests/transformers/models/flava/__init__.py b/tests/transformers/models/flava/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/flava/test_image_processing_flava.py b/tests/transformers/models/flava/test_image_processing_flava.py deleted file mode 100644 index 9e697886e..000000000 --- a/tests/transformers/models/flava/test_image_processing_flava.py +++ /dev/null @@ -1,386 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, is_mindspore_available -from mindnlp.utils import is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - -if is_vision_available(): - import PIL - - from mindnlp.transformers import FlavaImageProcessor - from mindnlp.transformers.image_utils import PILImageResampling - from mindnlp.transformers.models.flava.image_processing_flava import ( - FLAVA_CODEBOOK_MEAN, - FLAVA_CODEBOOK_STD, - FLAVA_IMAGE_MEAN, - FLAVA_IMAGE_STD, - ) -else: - FLAVA_IMAGE_MEAN = FLAVA_IMAGE_STD = FLAVA_CODEBOOK_MEAN = FLAVA_CODEBOOK_STD = None - - -class FlavaImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - resample=None, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=FLAVA_IMAGE_MEAN, - image_std=FLAVA_IMAGE_STD, - input_size_patches=14, - total_mask_patches=75, - mask_group_max_patches=None, - mask_group_min_patches=16, - mask_group_min_aspect_ratio=0.3, - mask_group_max_aspect_ratio=None, - codebook_do_resize=True, - codebook_size=None, - codebook_resample=None, - codebook_do_center_crop=True, - codebook_crop_size=None, - codebook_do_map_pixels=True, - codebook_do_normalize=True, - codebook_image_mean=FLAVA_CODEBOOK_MEAN, - codebook_image_std=FLAVA_CODEBOOK_STD, - ): - size = size if size is not None else {"height": 224, "width": 224} - crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} - codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112} - codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112} - - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.do_resize = do_resize - 
self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.size = size - self.resample = resample if resample is not None else PILImageResampling.BICUBIC - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_center_crop = do_center_crop - self.crop_size = crop_size - - self.input_size_patches = input_size_patches - self.total_mask_patches = total_mask_patches - self.mask_group_max_patches = mask_group_max_patches - self.mask_group_min_patches = mask_group_min_patches - self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio - self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio - - self.codebook_do_resize = codebook_do_resize - self.codebook_size = codebook_size - self.codebook_resample = codebook_resample if codebook_resample is not None else PILImageResampling.LANCZOS - self.codebook_do_center_crop = codebook_do_center_crop - self.codebook_crop_size = codebook_crop_size - self.codebook_do_map_pixels = codebook_do_map_pixels - self.codebook_do_normalize = codebook_do_normalize - self.codebook_image_mean = codebook_image_mean - self.codebook_image_std = codebook_image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - "resample": self.resample, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "input_size_patches": self.input_size_patches, - "total_mask_patches": self.total_mask_patches, - "mask_group_max_patches": self.mask_group_max_patches, - "mask_group_min_patches": self.mask_group_min_patches, - "mask_group_min_aspect_ratio": self.mask_group_min_aspect_ratio, - "mask_group_max_aspect_ratio": self.mask_group_min_aspect_ratio, - "codebook_do_resize": self.codebook_do_resize, - "codebook_size": self.codebook_size, - "codebook_resample": self.codebook_resample, - "codebook_do_center_crop": self.codebook_do_center_crop, - "codebook_crop_size": self.codebook_crop_size, - "codebook_do_map_pixels": self.codebook_do_map_pixels, - "codebook_do_normalize": self.codebook_do_normalize, - "codebook_image_mean": self.codebook_image_mean, - "codebook_image_std": self.codebook_image_std, - } - - def get_expected_image_size(self): - return (self.size["height"], self.size["width"]) - - def get_expected_mask_size(self): - return ( - (self.input_size_patches, self.input_size_patches) - if not isinstance(self.input_size_patches, tuple) - else self.input_size_patches - ) - - def get_expected_codebook_image_size(self): - return (self.codebook_size["height"], self.codebook_size["width"]) - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class FlavaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = FlavaImageProcessor if is_vision_available() else None - maxDiff = None - - def setUp(self): - self.image_processor_tester = FlavaImageProcessingTester(self) - - @property - def image_processor_dict(self): - return 
self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "resample")) - self.assertTrue(hasattr(image_processing, "crop_size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "masking_generator")) - self.assertTrue(hasattr(image_processing, "codebook_do_resize")) - self.assertTrue(hasattr(image_processing, "codebook_size")) - self.assertTrue(hasattr(image_processing, "codebook_resample")) - self.assertTrue(hasattr(image_processing, "codebook_do_center_crop")) - self.assertTrue(hasattr(image_processing, "codebook_crop_size")) - self.assertTrue(hasattr(image_processing, "codebook_do_map_pixels")) - self.assertTrue(hasattr(image_processing, "codebook_do_normalize")) - self.assertTrue(hasattr(image_processing, "codebook_image_mean")) - self.assertTrue(hasattr(image_processing, "codebook_image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 224, "width": 224}) - self.assertEqual(image_processor.crop_size, {"height": 224, "width": 224}) - self.assertEqual(image_processor.codebook_size, {"height": 112, "width": 112}) - self.assertEqual(image_processor.codebook_crop_size, {"height": 112, "width": 112}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, crop_size=84, codebook_size=33, codebook_crop_size=66 - ) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - self.assertEqual(image_processor.codebook_size, {"height": 33, "width": 33}) - self.assertEqual(image_processor.codebook_crop_size, {"height": 66, "width": 66}) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, PIL.Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms") - - # Test no bool masked pos - self.assertFalse("bool_masked_pos" in encoded_images) - - expected_height, expected_width = self.image_processor_tester.get_expected_image_size() - - self.assertEqual( - encoded_images.pixel_values.shape, - (1, self.image_processor_tester.num_channels, expected_height, expected_width), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms") - expected_height, expected_width = self.image_processor_tester.get_expected_image_size() - - # Test no bool masked pos - self.assertFalse("bool_masked_pos" in encoded_images) - - self.assertEqual( - encoded_images.pixel_values.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - expected_height, - expected_width, - ), - ) - - def 
_test_call_framework(self, instance_class, prepare_kwargs): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, **prepare_kwargs) - for image in image_inputs: - self.assertIsInstance(image, instance_class) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms") - - expected_height, expected_width = self.image_processor_tester.get_expected_image_size() - self.assertEqual( - encoded_images.pixel_values.shape, - (1, self.image_processor_tester.num_channels, expected_height, expected_width), - ) - - encoded_images = image_processing(image_inputs, return_image_mask=True, return_tensors="ms") - - expected_height, expected_width = self.image_processor_tester.get_expected_image_size() - self.assertEqual( - encoded_images.pixel_values.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - expected_height, - expected_width, - ), - ) - - expected_height, expected_width = self.image_processor_tester.get_expected_mask_size() - self.assertEqual( - encoded_images.bool_masked_pos.shape, - ( - self.image_processor_tester.batch_size, - expected_height, - expected_width, - ), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - - expected_height, expected_width = self.image_processor_tester.get_expected_image_size() - self.assertEqual( - encoded_images.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - expected_height, - expected_width, - ), - ) - - # Test masking - encoded_images = image_processing(image_inputs, return_image_mask=True, return_tensors="ms") - - expected_height, expected_width = self.image_processor_tester.get_expected_image_size() - self.assertEqual( - encoded_images.pixel_values.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - expected_height, - expected_width, - ), - ) - - expected_height, expected_width = self.image_processor_tester.get_expected_mask_size() - self.assertEqual( - encoded_images.bool_masked_pos.shape, - ( - self.image_processor_tester.batch_size, - expected_height, - expected_width, - ), - ) - - def test_call_numpy(self): - self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True}) - - def test_call_numpy_4_channels(self): - self.image_processing_class.num_channels = 4 - self._test_call_framework(np.ndarray, prepare_kwargs={"numpify": True}) - self.image_processing_class.num_channels = 3 - - def test_call_pytorch(self): - self._test_call_framework(mindspore.Tensor, prepare_kwargs={"torchify": True}) - - def test_masking(self): - # Initialize image_processing - random.seed(1234) - image_processing = self.image_processing_class(**self.image_processor_dict) - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_image_mask=True, return_tensors="ms") - self.assertEqual(encoded_images.bool_masked_pos.sum().item(), 75) - - def test_codebook_pixels(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - 
self.assertIsInstance(image, PIL.Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_codebook_pixels=True, return_tensors="ms") - expected_height, expected_width = self.image_processor_tester.get_expected_codebook_image_size() - self.assertEqual( - encoded_images.codebook_pixel_values.shape, - (1, self.image_processor_tester.num_channels, expected_height, expected_width), - ) - - # Test batched - encoded_images = image_processing(image_inputs, return_codebook_pixels=True, return_tensors="ms") - expected_height, expected_width = self.image_processor_tester.get_expected_codebook_image_size() - self.assertEqual( - encoded_images.codebook_pixel_values.shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - expected_height, - expected_width, - ), - ) diff --git a/tests/transformers/models/flava/test_modeling_flava.py b/tests/transformers/models/flava/test_modeling_flava.py deleted file mode 100644 index de1b98db1..000000000 --- a/tests/transformers/models/flava/test_modeling_flava.py +++ /dev/null @@ -1,1332 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
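Editor's note: the FLAVA image-processing tests above exercise three optional outputs of the processor: pixel_values, a patch mask via return_image_mask, and codebook_pixel_values via return_codebook_pixels. A minimal usage sketch follows, assuming the mindnlp FlavaImageProcessor mirrors the Hugging Face API used in those tests; the "facebook/flava-full" checkpoint appears elsewhere in these tests, while the image path is a placeholder.

    from PIL import Image
    from mindnlp.transformers import FlavaImageProcessor

    # Load the processor from the FLAVA checkpoint (default size 224, codebook size 112).
    processor = FlavaImageProcessor.from_pretrained("facebook/flava-full")
    image = Image.open("example.jpg")  # placeholder image path
    outputs = processor(image, return_image_mask=True, return_codebook_pixels=True, return_tensors="ms")
    # outputs.pixel_values          -> roughly (1, 3, 224, 224), input to the image encoder
    # outputs.bool_masked_pos       -> roughly (1, 14, 14), the random patch mask used for MIM
    # outputs.codebook_pixel_values -> roughly (1, 3, 112, 112), input to the dVAE image codebook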
-"""Testing suite for the Mindspore FLAVA model.""" - -import inspect -import os -import random -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import ( - FlavaConfig, - FlavaImageCodebookConfig, - FlavaImageConfig, - FlavaMultimodalConfig, - FlavaTextConfig, -) -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindspore import ops -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - - from mindnlp.transformers import ( - FlavaForPreTraining, - FlavaImageCodebook, - FlavaImageModel, - FlavaModel, - FlavaMultimodalModel, - FlavaTextModel, - ) -else: - FlavaModel = None - FlavaForPreTraining = None - mindspore = {} - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import FlavaProcessor - - -class FlavaImageModelTester: - def __init__( - self, - parent, - batch_size=12, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - image_size=30, - patch_size=2, - num_channels=3, - qkv_bias=True, - mask_token=True, - vocab_size=99, - ): - self.parent = parent - self.batch_size = batch_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.qkv_bias = qkv_bias - self.mask_token = mask_token - self.vocab_size = vocab_size - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - num_patches = self.image_size // self.patch_size - bool_masked_pos = ( - ops.rand((self.batch_size, num_patches, num_patches)) < 0.9 - ) - config = self.get_config() - return config, pixel_values, bool_masked_pos - - def get_config(self): - return FlavaImageConfig( - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - qkv_bias=self.qkv_bias, - mask_token=self.mask_token, - vocab_size=self.vocab_size, - ) - - def create_and_check_model(self, config, pixel_values, bool_masked_pos): - model = FlavaImageModel(config=config) - model.set_train(False) - with mindspore._no_grad(): - result = model(pixel_values, bool_masked_pos) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, 
self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, bool_masked_pos = config_and_inputs - inputs_dict = {"pixel_values": pixel_values, "bool_masked_pos": bool_masked_pos} - return config, inputs_dict - - -@require_mindspore -class FlavaImageModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as FLAVA does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (FlavaImageModel,) if is_mindspore_available() else () - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = FlavaImageModelTester(self) - self.config_tester = ConfigTester(self, config_class=FlavaImageConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - # in FLAVA, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) - image_size = (self.model_tester.image_size, self.model_tester.image_size) - patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_len = num_patches + 1 - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - 
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # FLAVA has a different seq_length - image_size = (self.model_tester.image_size, self.model_tester.image_size) - patch_size = (self.model_tester.patch_size, self.model_tester.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_length = num_patches + 1 - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # skip this test as FlavaImageModel has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_from_base(self): - pass - - # skip this test as FlavaImageModel has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/flava-full" - model = FlavaImageModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class FlavaTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - vocab_size=102, - type_vocab_size=2, - max_position_embeddings=512, - position_embedding_type="absolute", - hidden_size=32, - num_hidden_layers=2, - 
num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - qkv_bias=True, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.seq_length = seq_length - self.vocab_size = vocab_size - self.type_vocab_size = type_vocab_size - self.max_position_embeddings = max_position_embeddings - self.position_embedding_type = position_embedding_type - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.pad_token_id = pad_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - # if input_mask is not None: - # batch_size, seq_length = input_mask.shape - # rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - # for batch_idx, start_index in enumerate(rnd_start_indices): - # input_mask[batch_idx, :start_index] = 1 - # input_mask[batch_idx, start_index:] = 0 - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - # a = input_mask.asnumpy() - # a[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, :int(start_index)] = 1 # mindspore.Tensor(1, dtype=mindspore.int64) - input_mask[batch_idx, int(start_index):] = 0 # mindspore.Tensor(0, dtype=mindspore.int64) - # input_mask[batch_idx, :start_index] = 1 mindspore.Tensor(1) - # input_mask[batch_idx, start_index:] = 0 - # ops.scatter_nd_update(input_mask, - # ops.stack([ops.full((int(start_index),), batch_idx), ops.arange(mindspore.tensor(start_index))], axis=1), - # ops.full((int(start_index),), 1)) - # ops.scatter_nd_update(input_mask, - # ops.stack([ops.full((input_mask.shape[1] - int(start_index),), batch_idx), ops.arange(mindspore.tensor(input_mask.shape[1] - start_index))], axis=1), - # ops.full((input_mask.shape[1] - int(start_index),), 0)) - - - token_type_ids = None - - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask - - - - def get_config(self): - return FlavaTextConfig( - vocab_size=self.vocab_size, - type_vocab_size=self.type_vocab_size, - max_position_embeddings=self.max_position_embeddings, - position_embedding_type=self.position_embedding_type, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - pad_token_id=self.pad_token_id, - 
qkv_bias=self.qkv_bias, - ) - - def create_and_check_model(self, config, input_ids, token_type_ids, input_mask): - model = FlavaTextModel(config=config) - model.set_train(False) - with mindspore._no_grad(): - result = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, token_type_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaTextModel,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = FlavaTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=FlavaTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds - pass - - # skip this test as FlavaTextModel has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_from_base(self): - pass - - # skip this test as FlavaTextModel has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/flava-full" - model = FlavaTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class FlavaMultimodalModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=44, - use_input_mask=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - qkv_bias=True, - ce_ignore_index=-100, - use_cls_token=True, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.use_input_mask = use_input_mask - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - 
self.ce_ignore_index = ce_ignore_index - self.use_cls_token = use_cls_token - - def prepare_config_and_inputs(self): - hidden_states = floats_tensor([self.batch_size, self.seq_length - 1, self.hidden_size]) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - # input_mask[batch_idx, :start_index] = 1 - # input_mask[batch_idx, start_index:] = 0 - # ops.scatter_nd_update(input_mask, - # ops.stack([ops.full((int(start_index),), batch_idx), ops.arange(mindspore.tensor(start_index))], axis=1), - # ops.full((int(start_index),), 1)) - # ops.scatter_nd_update(input_mask, - # ops.stack([ops.full((input_mask.shape[1] - int(start_index),), batch_idx), ops.arange(mindspore.tensor(input_mask.shape[1] - start_index))], axis=1), - # ops.full((input_mask.shape[1] - int(start_index),), 0)) - input_mask[batch_idx, :int(start_index)] = 1 #mindspore.Tensor(1, dtype=mindspore.int64) - input_mask[batch_idx, int(start_index):] = 0 # mindspore.Tensor(0, dtype=mindspore.int64) - - config = self.get_config() - - return config, hidden_states, input_mask - - def get_config(self): - return FlavaMultimodalConfig( - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - qkv_bias=self.qkv_bias, - use_cls_token=self.use_cls_token, - ce_ignore_index=self.ce_ignore_index, - ) - - def create_and_check_model(self, config, hidden_states, input_mask): - model = FlavaMultimodalModel(config=config) - model.set_train(False) - with mindspore._no_grad(): - result = model(hidden_states, attention_mask=input_mask) - result = model(hidden_states) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, hidden_states, input_mask = config_and_inputs - inputs_dict = {"hidden_states": hidden_states, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaMultimodalModel,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = FlavaMultimodalModelTester(self) - self.config_tester = ConfigTester( - self, config_class=FlavaMultimodalConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # 
signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["hidden_states"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model_get_set_embeddings(self): - # No embedding in multimodal model - pass - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds - pass - - # skip this test as FlavaMultimodalModel has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_from_base(self): - pass - - # skip this test as FlavaMultimodalModel has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/flava-full" - model = FlavaMultimodalModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class FlavaImageCodebookTester: - def __init__( - self, - parent, - batch_size=12, - image_size=112, - num_channels=3, - hidden_size=32, - num_groups=2, - vocab_size=99, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.hidden_size = hidden_size - self.num_groups = num_groups - self.vocab_size = vocab_size - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return FlavaImageCodebookConfig( - hidden_size=self.hidden_size, num_groups=self.num_groups, vocab_size=self.vocab_size - ) - - def create_and_check_model(self, config, pixel_values): - model = FlavaImageCodebook(config=config) - model.set_train(False) - with mindspore._no_grad(): - result = model(pixel_values) - self.parent.assertEqual( - result.shape, (self.batch_size, config.vocab_size, self.image_size // 8, self.image_size // 8) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaImageCodebook,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - test_resize_embeddings = False - has_attentions = False - - def setUp(self): - self.model_tester = FlavaImageCodebookTester(self) - self.config_tester = ConfigTester(self, config_class=FlavaImageCodebookConfig, has_text_modality=False) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = 
model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - @unittest.skip(reason="Flava does not output attentions") - def test_attention_outputs(self): - pass - - def test_model_get_set_embeddings(self): - # No embedding in multimodal model - pass - - def test_training(self): - pass - - def test_hidden_states_output(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - # no attentions - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds - pass - - def test_model_outputs_equivalence(self): - pass - - # skip this test as FlavaImageCodebook has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_from_base(self): - pass - - # skip this test as FlavaImageCodebook has no base class and is - # not available in MODEL_MAPPING - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/flava-full" - model = FlavaImageCodebook.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class FlavaModelTester: - model_class = FlavaModel - - def __init__( - self, - parent, - text_kwargs=None, - image_kwargs=None, - multimodal_kwargs=None, - image_codebook_kwargs=None, - is_training=True, - hidden_size=32, - projection_dim=32, - initializer_range=0.02, - layer_norm_eps=1e-12, - ): - if text_kwargs is None: - text_kwargs = {} - if image_kwargs is None: - image_kwargs = {} - if multimodal_kwargs is None: - multimodal_kwargs = {} - if image_codebook_kwargs is None: - image_codebook_kwargs = {} - - self.parent = parent - self.image_model_tester = FlavaImageModelTester(parent, **image_kwargs) - self.text_model_tester = FlavaTextModelTester(parent, **text_kwargs) - self.multimodal_model_tester = FlavaMultimodalModelTester(parent, **multimodal_kwargs) - self.image_codebook_tester = FlavaImageCodebookTester(parent, **image_codebook_kwargs) - self.is_training = is_training - self.config_tester = ConfigTester(self, config_class=FlavaConfig, hidden_size=37) - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - - def test_config(self): - self.config_tester.run_common_tests() - - def prepare_config_and_inputs_for_common(self): - _, pixel_values, bool_masked_pos = self.image_model_tester.prepare_config_and_inputs() - _, input_ids, token_type_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - 
"bool_masked_pos": bool_masked_pos, - } - - def get_config(self): - return FlavaConfig.from_configs( - self.image_model_tester.get_config(), - self.text_model_tester.get_config(), - self.multimodal_model_tester.get_config(), - self.image_codebook_tester.get_config(), - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - ) - - def create_and_check_model(self, config, inputs): - self._test_model(config, inputs, test_image=True) - self._test_model(config, inputs, test_text=True) - self._test_model(config, inputs, test_image=True, test_text=True) - - def _test_model(self, config, inputs, test_image=False, test_text=False): - model = self.model_class(config).set_train(False) - with mindspore._no_grad(): - result = model( - input_ids=inputs["input_ids"] if test_text else None, - attention_mask=inputs["attention_mask"] if test_text else None, - token_type_ids=inputs["token_type_ids"] if test_text else None, - pixel_values=inputs["pixel_values"] if test_image else None, - bool_masked_pos=inputs["bool_masked_pos"] if test_image else None, - ) - image_size = (self.image_model_tester.image_size, self.image_model_tester.image_size) - patch_size = (self.image_model_tester.patch_size, self.image_model_tester.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - - if test_image: - self.parent.assertEqual( - result.image_embeddings.shape, - (self.image_model_tester.batch_size, num_patches + 1, self.image_model_tester.hidden_size), - ) - else: - self.parent.assertIsNone(result.image_embeddings) - - if test_text: - self.parent.assertEqual( - result.text_embeddings.shape, - ( - self.text_model_tester.batch_size, - self.text_model_tester.seq_length, - self.text_model_tester.hidden_size, - ), - ) - else: - self.parent.assertIsNone(result.text_embeddings) - - if test_image and test_text: - self.parent.assertEqual( - result.multimodal_embeddings.shape, - ( - self.multimodal_model_tester.batch_size, - self.text_model_tester.seq_length + num_patches + 2, - self.multimodal_model_tester.hidden_size, - ), - ) - else: - self.parent.assertIsNone(result.multimodal_embeddings) - - -@require_mindspore -class FlavaModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaModel,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": FlavaModel} if is_mindspore_available() else {} - class_for_tester = FlavaModelTester - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = self.class_for_tester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_model(*config_and_inputs) - - # hidden_states are tested in individual model tests - def test_hidden_states_output(self): - pass - - # input_embeds are tested in individual model tests - def test_inputs_embeds(self): - pass - - # tested in individual model tests - def test_retain_grad_hidden_states_attentions(self): - pass - - # FlavaModel does not have input/output embeddings - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for FLAVA - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in 
self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale" or name == "flava.logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - - def test_load_image_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save FlavaConfig and check if we can load FlavaImageConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - image_config = FlavaImageConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.image_config.to_dict(), image_config.to_dict()) - - # Save FlavaConfig and check if we can load FlavaTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = FlavaTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - # Save FlavaConfig and check if we can load FlavaMultimodalConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - multimodal_config = FlavaMultimodalConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.multimodal_config.to_dict(), multimodal_config.to_dict()) - - # overwrite from common since FlavaModel/TFFlavaModel return FLAVAOutput/TFFLAVAOutput - @slow - def test_model_from_pretrained(self): - model_name = "facebook/flava-full" - model = FlavaModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class FlavaForPreTrainingTester(FlavaModelTester): - model_class = FlavaForPreTraining - - def prepare_config_and_inputs_for_common(self): - _, pixel_values, bool_masked_pos = self.image_model_tester.prepare_config_and_inputs() - _, input_ids, token_type_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - config = self.get_config() - - # input_ids_masked = input_ids.detach().clone() - input_ids_masked = mindspore.Tensor(input_ids.asnumpy(), dtype=input_ids.dtype) - input_ids_masked.stop_gradient = True - input_ids_masked[:, 1:3] = 100 - # mlm_labels = input_ids.detach().clone() - mlm_labels = mindspore.Tensor(input_ids.asnumpy(), dtype=input_ids.dtype) - mlm_labels.stop_gradient = True - mlm_labels[:, :] = config.ce_ignore_index - mlm_labels[:, 1:3] = input_ids[:, 1:3] - mim_labels = ops.randint( - 0, self.image_model_tester.vocab_size, bool_masked_pos.shape - ) - mim_labels[bool_masked_pos.ne(True)] = config.ce_ignore_index - itm_labels = ops.ones(mlm_labels.shape[0], dtype=mindspore.int64) # .int64 - - return config, { - "input_ids": input_ids, - "input_ids_masked": input_ids_masked, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "bool_masked_pos": bool_masked_pos, - "mlm_labels": mlm_labels, - "mim_labels": mim_labels, - "itm_labels": itm_labels, - "return_loss": True, - } - - def _test_model(self, config, inputs, test_image=False, test_text=False): - model = self.model_class(config).set_train(False) - with mindspore._no_grad(): - result = model( - input_ids=inputs["input_ids"] if test_text 
else None, - input_ids_masked=inputs["input_ids_masked"] if test_text else None, - attention_mask=inputs["attention_mask"] if test_text else None, - token_type_ids=inputs["token_type_ids"] if test_text else None, - pixel_values=inputs["pixel_values"] if test_image else None, - bool_masked_pos=inputs["bool_masked_pos"] if test_image else None, - mlm_labels=inputs["mlm_labels"], - mim_labels=inputs["mim_labels"], - itm_labels=inputs["itm_labels"], - return_loss=inputs["return_loss"], - ) - image_size = (self.image_model_tester.image_size, self.image_model_tester.image_size) - patch_size = (self.image_model_tester.patch_size, self.image_model_tester.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - - if test_image: - self.parent.assertEqual( - result.image_embeddings.shape, - (self.image_model_tester.batch_size, num_patches + 1, self.image_model_tester.hidden_size), - ) - if not test_text: - self.parent.assertEqual( - result.loss_info.mim.dim(), - 0, - ) - self.parent.assertEqual( - result.mim_logits.shape, - (inputs["bool_masked_pos"].sum().item(), self.image_model_tester.vocab_size), - ) - - else: - self.parent.assertIsNone(result.image_embeddings) - - if test_text: - self.parent.assertEqual( - result.text_embeddings.shape, - ( - self.text_model_tester.batch_size, - self.text_model_tester.seq_length, - self.text_model_tester.hidden_size, - ), - ) - if not test_image: - self.parent.assertEqual(result.loss_info.mlm.dim(), 0) - self.parent.assertEqual( - result.mlm_logits.shape, - ( - (inputs["mlm_labels"] != self.multimodal_model_tester.ce_ignore_index).sum().item(), - self.text_model_tester.vocab_size, - ), - ) - else: - self.parent.assertIsNone(result.text_embeddings) - - if test_image and test_text: - self.parent.assertEqual( - result.multimodal_masked_embeddings.shape, - ( - self.multimodal_model_tester.batch_size, - self.text_model_tester.seq_length + num_patches + 2, - self.multimodal_model_tester.hidden_size, - ), - ) - self.parent.assertEqual( - result.itm_logits.shape, - (self.text_model_tester.batch_size, 2), - ) - self.parent.assertEqual( - result.mmm_text_logits.shape, - ( - (inputs["mlm_labels"] != self.multimodal_model_tester.ce_ignore_index).sum().item(), - self.text_model_tester.vocab_size, - ), - ) - self.parent.assertEqual( - result.mmm_image_logits.shape, - (inputs["bool_masked_pos"].sum().item(), self.image_model_tester.vocab_size), - ) - self.parent.assertEqual( - result.contrastive_logits_per_image.shape, - (self.image_model_tester.batch_size, self.text_model_tester.batch_size), - ) - self.parent.assertEqual( - result.contrastive_logits_per_text.shape, - (self.text_model_tester.batch_size, self.image_model_tester.batch_size), - ) - - for item in [ - result.loss_info.global_contrastive, - result.loss_info.itm, - result.loss_info.mmm_text, - result.loss_info.mmm_image, - ]: - self.parent.assertEqual(item.dim(), 0) - - for item in [result.loss_info.mim, result.loss_info.mlm]: - self.parent.assertIsNone(item) - - else: - self.parent.assertIsNone(result.multimodal_masked_embeddings) - for item in [ - result.loss_info.global_contrastive, - result.loss_info.itm, - result.loss_info.mmm_text, - result.loss_info.mmm_image, - ]: - self.parent.assertIsNone(item) - - self.parent.assertIsNone(result.multimodal_embeddings) - - -@require_mindspore -class FlavaForPreTrainingTest(FlavaModelTest): - all_model_classes = (FlavaForPreTraining,) if is_mindspore_available() else () - class_for_tester = FlavaForPreTrainingTester - - @unittest.skip( 
- reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class FlavaModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "facebook/flava-full" - # model = FlavaModel.from_pretrained(model_name, ms_dtype=mindspore.float16) - # processor = FlavaProcessor.from_pretrained(model_name, ms_dtype=mindspore.float16) - - model = FlavaModel.from_pretrained(model_name) - processor = FlavaProcessor.from_pretrained(model_name) - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], - images=[image, image], - padding="max_length", - max_length=77, - return_tensors="ms", - ) - # inputs["pixel_values"] = inputs["pixel_values"].astype(mindspore.float16) - - # forward pass - with mindspore._no_grad(): - outputs = model(**inputs, return_dict=True) - - # verify the embeddings - self.assertAlmostEqual(outputs.image_embeddings.sum().item(), -1352.53540, places=3) - self.assertAlmostEqual(outputs.text_embeddings.sum().item(), -198.98225, places=3) - self.assertAlmostEqual(outputs.multimodal_embeddings.sum().item(), -4030.4602050, places=3) - - -@require_vision -@require_mindspore -class FlavaForPreTrainingIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "facebook/flava-full" - model = FlavaForPreTraining.from_pretrained(model_name) - processor = FlavaProcessor.from_pretrained(model_name) - mindspore.set_seed(1) - random.seed(1) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], - images=[image, image], - padding="max_length", - max_length=77, - return_tensors="ms", - return_codebook_pixels=True, - return_image_mask=True, - ) - # Create a clone of the input_ids tensor that will be its masked version - inputs["input_ids_masked"] = inputs["input_ids"].copy() - # Mask the tokens "a" & "cat" from the "a photo of a cat" text using the special 103 value - inputs["input_ids_masked"][0, 4:6] = 103 - # MLM labels. 
It is a cloned version of input_ids where all values are -100 (i.e., ignored) - # except those that are masked, whose original values are stored - inputs["mlm_labels"] = inputs["input_ids"].copy() - inputs["mlm_labels"][:, :] = -100 - inputs["mlm_labels"][0, 4:6] = inputs["input_ids"][0, 4:6] - - # forward pass - with mindspore._no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.contrastive_logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.contrastive_logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - expected_logits = mindspore.Tensor([[16.1291, 8.4033], [16.1291, 8.4033]]) - self.assertTrue(np.allclose(outputs.contrastive_logits_per_image.asnumpy(), expected_logits.asnumpy(), atol=1e-3)) - self.assertAlmostEqual(outputs.loss_info.mmm_text.item(), 2.0727925, places=4) - self.assertAlmostEqual(outputs.loss_info.mmm_image.item(), 7.0282096, places=4) - self.assertAlmostEqual(outputs.loss.item(), 11.3792324, places=4) - - @slow - def test_inference_with_itm_labels(self): - model_name = "facebook/flava-full" - model = FlavaForPreTraining.from_pretrained(model_name) - processor = FlavaProcessor.from_pretrained(model_name) - mindspore.set_seed(1) - random.seed(1) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], - images=[image, image], - padding="max_length", - max_length=77, - return_tensors="ms", - return_codebook_pixels=True, - return_image_mask=True, - ) - # Create a clone of the input_ids tensor that will be its masked version - inputs["input_ids_masked"] = inputs["input_ids"].copy() - # Mask the tokens "a" & "cat" from the "a photo of a cat" text using the special 103 value - inputs["input_ids_masked"][0, 4:6] = 103 - # MLM labels. It is a cloned version of input_ids where all values are -100 (i.e., ignored) - # except those that are masked, whose original values are stored - inputs["mlm_labels"] = inputs["input_ids"].copy() - inputs["mlm_labels"][:, :] = -100 - inputs["mlm_labels"][0, 4:6] = inputs["input_ids"][0, 4:6] - # Manually create the itm_labels tensor that indicates if the image-text match. - # In this case, the firs pair matches and the second does not - inputs["itm_labels"] = mindspore.Tensor([1, 0]) - - # forward pass - with mindspore._no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.contrastive_logits_per_image.shape, - (1, inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.contrastive_logits_per_text.shape, - (1, inputs.pixel_values.shape[0]), - ) - - expected_logits = mindspore.Tensor([[16.1291, 8.4033], [16.1291, 8.4033]]) - self.assertTrue(np.allclose(outputs.contrastive_logits_per_image.asnumpy(), expected_logits.asnumpy(), atol=1e-3)) - self.assertAlmostEqual(outputs.loss_info.mmm_text.item(), 2.0727925, places=4) - self.assertAlmostEqual(outputs.loss_info.mmm_image.item(), 6.8965902, places=4) - self.assertAlmostEqual(outputs.loss.item(), 9.6084213, places=4) diff --git a/tests/transformers/models/flava/test_processor_flava.py b/tests/transformers/models/flava/test_processor_flava.py deleted file mode 100644 index 02c5b71a2..000000000 --- a/tests/transformers/models/flava/test_processor_flava.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import random -import shutil -import tempfile -import unittest - -import numpy as np -import pytest - -from mindnlp.transformers import BertTokenizer, BertTokenizerFast -from mindnlp.transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES -from mindnlp.utils.testing_utils import require_vision -from mindnlp.utils import is_vision_available -from mindnlp.configs import IMAGE_PROCESSOR_NAME - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import FlavaImageProcessor, FlavaProcessor - from mindnlp.transformers.models.flava.image_processing_flava import ( - FLAVA_CODEBOOK_MEAN, - FLAVA_CODEBOOK_STD, - FLAVA_IMAGE_MEAN, - FLAVA_IMAGE_STD, - ) - - -@require_vision -class FlavaProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low", "lowest"] # fmt: skip - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write("".join([x + "\n" for x in vocab_tokens])) - - image_processor_map = { - "image_mean": FLAVA_IMAGE_MEAN, - "image_std": FLAVA_IMAGE_STD, - "do_normalize": True, - "do_resize": True, - "size": 224, - "do_center_crop": True, - "crop_size": 224, - "input_size_patches": 14, - "total_mask_patches": 75, - "mask_group_max_patches": None, - "mask_group_min_patches": 16, - "mask_group_min_aspect_ratio": 0.3, - "mask_group_max_aspect_ratio": None, - "codebook_do_resize": True, - "codebook_size": 112, - "codebook_do_center_crop": True, - "codebook_crop_size": 112, - "codebook_do_map_pixels": True, - "codebook_do_normalize": True, - "codebook_image_mean": FLAVA_CODEBOOK_MEAN, - "codebook_image_std": FLAVA_CODEBOOK_STD, - } - - self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) - with open(self.image_processor_file, "w", encoding="utf-8") as fp: - json.dump(image_processor_map, fp) - - def get_tokenizer(self, **kwargs): - return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return FlavaImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - processor_slow = FlavaProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(self.tmpdirname) - processor_slow = FlavaProcessor.from_pretrained(self.tmpdirname, use_fast=False) - - processor_fast = FlavaProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(self.tmpdirname) - processor_fast = FlavaProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) - self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor_slow.image_processor, FlavaImageProcessor) - self.assertIsInstance(processor_fast.image_processor, FlavaImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = FlavaProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = FlavaProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, BertTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, FlavaImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - # With rest of the args - random.seed(1234) - input_feat_extract = image_processor( - image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" - ) - random.seed(1234) - input_processor = processor( - images=image_input, return_image_mask=True, return_codebook_pixels=True, return_tensors="np" - ) - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = 
FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]) - - # add extra args - inputs = processor(text=input_str, images=image_input, return_codebook_pixels=True, return_image_mask=True) - - self.assertListEqual( - list(inputs.keys()), - [ - "input_ids", - "token_type_ids", - "attention_mask", - "pixel_values", - "codebook_pixel_values", - "bool_masked_pos", - ], - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = FlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), processor.model_input_names) diff --git a/tests/transformers/models/florence2/__init__.py b/tests/transformers/models/florence2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/florence2/testing_modeling_florence2.py b/tests/transformers/models/florence2/testing_modeling_florence2.py deleted file mode 100644 index 5f70e1132..000000000 --- a/tests/transformers/models/florence2/testing_modeling_florence2.py +++ /dev/null @@ -1,30 +0,0 @@ -import requests - -import mindspore -from PIL import Image -from mindnlp.transformers.models.florence2 import Florence2ForConditionalGeneration, Florence2Processor - - -model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-large", ms_dtype=mindspore.float32) -processor = Florence2Processor.from_pretrained("microsoft/Florence-2-large") - - -prompt = "" - -url = "https://hf-mirror.com/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true" -image = Image.open(requests.get(url, stream=True).raw) - -inputs = processor(text=prompt, images=image, return_tensors='ms') - -generated_ids = model.generate( - input_ids=inputs["input_ids"], - pixel_values=inputs["pixel_values"], - max_new_tokens=1024, - num_beams=3, - do_sample=False -) -generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0] - -parsed_answer = processor.post_process_generation(generated_text, task="", image_size=(image.width, image.height)) - -print(parsed_answer) diff --git a/tests/transformers/models/fnet/__init__.py 
b/tests/transformers/models/fnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/fnet/test_modeling_fnet.py b/tests/transformers/models/fnet/test_modeling_fnet.py deleted file mode 100644 index 2c8989e91..000000000 --- a/tests/transformers/models/fnet/test_modeling_fnet.py +++ /dev/null @@ -1,756 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore FNet model.""" - -import unittest -from typing import Dict, List, Tuple -import numpy as np -from mindnlp.transformers import FNetConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import ( - require_tokenizers, - require_mindspore, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import ops - from mindnlp.transformers.models.auto.modeling_auto import ( - MODEL_FOR_PRETRAINING_MAPPING, - ) - from mindnlp.transformers import ( - FNetForMaskedLM, - FNetForMultipleChoice, - FNetForNextSentencePrediction, - FNetForPreTraining, - FNetForQuestionAnswering, - FNetForSequenceClassification, - FNetForTokenClassification, - FNetModel, - FNetTokenizerFast, - ) - from mindnlp.transformers.models.fnet.modeling_fnet import ( - FNetBasicFourierTransform, - is_scipy_available, - ) - - -# Override ConfigTester -class FNetConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - if self.has_text_modality: - self.parent.assertTrue(hasattr(config, "vocab_size")) - self.parent.assertTrue(hasattr(config, "hidden_size")) - self.parent.assertTrue(hasattr(config, "num_hidden_layers")) - - -class FNetModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - 
self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return FNetConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - tpu_short_seq_length=self.seq_length, - ) - - @require_mindspore - def create_and_check_fourier_transform(self, config): - hidden_states = floats_tensor( - [self.batch_size, self.seq_length, config.hidden_size] - ) - transform = FNetBasicFourierTransform(config) - fftn_output = transform(hidden_states) - - config.use_tpu_fourier_optimizations = True - if is_scipy_available(): - transform = FNetBasicFourierTransform(config) - dft_output = transform(hidden_states) - - config.max_position_embeddings = 4097 - transform = FNetBasicFourierTransform(config) - fft_output = transform(hidden_states) - - if is_scipy_available(): - self.parent.assertTrue( - np.allclose( - fftn_output[0][0].asnumpy(), dft_output[0][0].asnumpy(), atol=1e-4 - ) - ) - self.parent.assertTrue( - np.allclose( - fft_output[0][0].asnumpy(), dft_output[0][0].asnumpy(), atol=1e-4 - ) - ) - self.parent.assertTrue( - np.allclose( - fftn_output[0][0].asnumpy(), fft_output[0][0].asnumpy(), atol=1e-4 - ) - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - model = FNetModel(config=config) - model.set_train(False) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_for_pretraining( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - model = FNetForPreTraining(config=config) - model.set_train(False) - result = model( - input_ids, - token_type_ids=token_type_ids, - labels=token_labels, - next_sentence_label=sequence_labels, - ) - self.parent.assertEqual( - result.prediction_logits.shape, - (self.batch_size, self.seq_length, self.vocab_size), - ) - self.parent.assertEqual( - result.seq_relationship_logits.shape, (self.batch_size, 2) - ) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - model = FNetForMaskedLM(config=config) - model.set_train(False) - result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, 
self.seq_length, self.vocab_size) - ) - - def create_and_check_for_next_sentence_prediction( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - model = FNetForNextSentencePrediction(config=config) - model.set_train(False) - result = model( - input_ids, - token_type_ids=token_type_ids, - next_sentence_label=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - model = FNetForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = FNetForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = FNetForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.num_labels) - ) - - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_choices = self.num_choices - model = FNetForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - result = model( - multiple_choice_inputs_ids, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids} - return config, inputs_dict - - -@require_mindspore -class FNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - FNetModel, - FNetForPreTraining, - FNetForMaskedLM, - FNetForNextSentencePrediction, - FNetForMultipleChoice, - FNetForQuestionAnswering, - FNetForSequenceClassification, - FNetForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": FNetModel, - "fill-mask": FNetForMaskedLM, - "question-answering": FNetForQuestionAnswering, - "text-classification": FNetForSequenceClassification, - "token-classification": FNetForTokenClassification, - 
"zero-shot": FNetForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # Skip Tests - test_pruning = False - test_head_masking = False - test_pruning = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, - pipeline_test_casse_name, - config_class, - model_architecture, - tokenizer_name, - processor_name, - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and not tokenizer_name.endswith("Fast") - ): - return True - - return False - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class( - inputs_dict, model_class, return_labels=return_labels - ) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=ms.int64, - ) - inputs_dict["next_sentence_label"] = ops.zeros( - self.model_tester.batch_size, dtype=ms.int64 - ) - return inputs_dict - - # Overriden Tests - @unittest.skip - def test_attention_outputs(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model( - **dict_inputs, return_dict=True, **additional_kwargs - ).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object, dict_object - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - np.allclose( - set_nan_tensor_to_zero(tuple_object).asnumpy(), - set_nan_tensor_to_zero(dict_object).asnumpy(), - atol=1e-5, - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." 
- ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - dict_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - check_equivalence(model, tuple_inputs, dict_inputs) - - # tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - # dict_inputs = self._prepare_for_class(inputs_dict, model_class) - # check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - dict_inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True - ) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True} - ) - - @unittest.skip("MindSpore has no retain grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - hidden_states = outputs.hidden_states[0] - - hidden_states.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - - def setUp(self): - self.model_tester = FNetModelTester(self) - self.config_tester = FNetConfigTester( - self, config_class=FNetConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification( - *config_and_inputs - ) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/fnet-base" - model = FNetModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class FNetModelIntegrationTest(unittest.TestCase): - @slow - 
def test_inference_for_masked_lm(self): - """ - For comparison: - 1. Modify the pre-training model `__call__` to skip computing metrics and return masked_lm_output like so: - ``` - ... - sequence_output, pooled_output = EncoderModel( - self.config, random_seed=self.random_seed, name="encoder")( - input_ids, input_mask, type_ids, deterministic=deterministic) - - masked_lm_output = nn.Dense( - self.config.d_emb, - kernel_init=default_kernel_init, - name="predictions_dense")( - sequence_output) - masked_lm_output = nn.gelu(masked_lm_output) - masked_lm_output = nn.LayerNorm( - epsilon=LAYER_NORM_EPSILON, name="predictions_layer_norm")( - masked_lm_output) - masked_lm_logits = layers.OutputProjection( - kernel=self._get_embedding_table(), name="predictions_output")( - masked_lm_output) - - next_sentence_logits = layers.OutputProjection( - n_out=2, kernel_init=default_kernel_init, name="classification")( - pooled_output) - - return masked_lm_logits - ... - ``` - 2. Run the following: - >>> import jax.numpy as jnp - >>> import sentencepiece as spm - >>> from flax.training import checkpoints - >>> from f_net.models import PreTrainingModel - >>> from f_net.configs.pretraining import get_config, ModelArchitecture - - >>> pretrained_params = checkpoints.restore_checkpoint('./f_net/f_net_checkpoint', None) # Location of original checkpoint - >>> pretrained_config = get_config() - >>> pretrained_config.model_arch = ModelArchitecture.F_NET - - >>> vocab_filepath = "./f_net/c4_bpe_sentencepiece.model" # Location of the sentence piece model - >>> tokenizer = spm.SentencePieceProcessor() - >>> tokenizer.Load(vocab_filepath) - >>> with pretrained_config.unlocked(): - >>> pretrained_config.vocab_size = tokenizer.GetPieceSize() - >>> tokens = jnp.array([[0, 1, 2, 3, 4, 5]]) - >>> type_ids = jnp.zeros_like(tokens, dtype="i4") - >>> attention_mask = jnp.ones_like(tokens) # Dummy. This gets deleted inside the model. 
- - >>> flax_pretraining_model = PreTrainingModel(pretrained_config) - >>> pretrained_model_params = freeze(pretrained_params['target']) - >>> flax_model_outputs = flax_pretraining_model.apply({"params": pretrained_model_params}, tokens, attention_mask, type_ids, None, None, None, None, deterministic=True) - >>> masked_lm_logits[:, :3, :3] - """ - - model = FNetForMaskedLM.from_pretrained("google/fnet-base") - - input_ids = ms.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - vocab_size = 32000 - - expected_shape = (1, 6, vocab_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = ms.tensor( - [ - [ - [-1.7819, -7.7384, -7.5002], - [-3.4746, -8.5943, -7.7762], - [-3.2052, -9.0771, -8.3468], - ] - ], - ) - self.assertTrue( - np.allclose( - output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3 - ) - ) - - @slow - @require_tokenizers - def test_inference_long_sentence(self): - tokenizer = FNetTokenizerFast.from_pretrained("google/fnet-base") - - inputs = tokenizer( - "the man worked as a [MASK].", - "this is his [MASK].", - return_tensors="ms", - padding="max_length", - max_length=512, - ) - - self.assertTrue( - np.allclose(inputs["input_ids"].asnumpy(), ms.tensor([[4, 13, 283, 2479, 106, 8, 6, 845, 5, 168, 65, 367, 6, 845, 5, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3]]).asnumpy())) # fmt: skip - - inputs = {k: v for k, v in inputs.items()} - - model = FNetForMaskedLM.from_pretrained("google/fnet-base", from_pt=True) - logits = model(**inputs).logits - # predictions_mask_1 = tokenizer.decode(logits[0, 6].topk(5)) - _, indices = ops.top_k(logits[0, 6], 5) - predictions_mask_1 = tokenizer.decode(indices) - - _, indices = ops.top_k(logits[0, 12], 5) - predictions_mask_2 = tokenizer.decode(indices) - - self.assertEqual( - predictions_mask_1.split(" "), ["man", "child", "teacher", "woman", "model"] - ) - self.assertEqual( - predictions_mask_2.split(" "), ["work", "wife", "job", "story", "name"] - ) - - @slow - def test_inference_for_next_sentence_prediction(self): - model = FNetForNextSentencePrediction.from_pretrained( - "google/fnet-base", from_pt=True - ) - - input_ids = 
ms.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - expected_shape = (1, 2) - self.assertEqual(output.shape, expected_shape) - - expected_slice = ms.tensor([[-0.2234, -0.0226]]) - - self.assertTrue( - np.allclose(output.asnumpy(), expected_slice.asnumpy(), atol=1e-4) - ) - - @slow - def test_inference_model(self): - model = FNetModel.from_pretrained("google/fnet-base") - - input_ids = ms.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - expected_shape = (1, 6, model.config.hidden_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = ms.tensor( - [ - [ - [4.1541, -0.1051, -0.1667], - [-0.9144, 0.2939, -0.0086], - [-0.8472, -0.7281, 0.0256], - ] - ], - ) - - self.assertTrue( - np.allclose( - output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4 - ) - ) diff --git a/tests/transformers/models/focalnet/__init__.py b/tests/transformers/models/focalnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/focalnet/test_modeling_focalnet.py b/tests/transformers/models/focalnet/test_modeling_focalnet.py deleted file mode 100644 index 17c3cb068..000000000 --- a/tests/transformers/models/focalnet/test_modeling_focalnet.py +++ /dev/null @@ -1,428 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the Mindspore Focalnet model.""" -import pdb -import collections -import unittest -import numpy as np - -from mindnlp.transformers import FocalNetConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - - from mindnlp.transformers import ( - FocalNetBackbone, - FocalNetForImageClassification, - FocalNetForMaskedImageModeling, - FocalNetModel, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class FocalNetModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - patch_size=2, - num_channels=3, - embed_dim=16, - hidden_sizes=[32, 64, 128], - depths=[1, 2, 1], - num_heads=[2, 2, 4], - window_size=2, - mlp_ratio=2.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - patch_norm=True, - initializer_range=0.02, - layer_norm_eps=1e-5, - is_training=True, - scope=None, - use_labels=True, - type_sequence_label_size=10, - encoder_stride=8, - out_features=["stage1", "stage2"], - out_indices=[1, 2], - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.hidden_sizes = hidden_sizes - self.depths = depths - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.patch_norm = patch_norm - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.is_training = is_training - self.scope = scope - self.use_labels = use_labels - self.type_sequence_label_size = type_sequence_label_size - self.encoder_stride = encoder_stride - self.out_features = out_features - self.out_indices = out_indices - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return FocalNetConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - embed_dim=self.embed_dim, - hidden_sizes=self.hidden_sizes, - depths=self.depths, - num_heads=self.num_heads, - window_size=self.window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=self.qkv_bias, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - drop_path_rate=self.drop_path_rate, - hidden_act=self.hidden_act, - use_absolute_embeddings=self.use_absolute_embeddings, - path_norm=self.patch_norm, - layer_norm_eps=self.layer_norm_eps, - initializer_range=self.initializer_range, - 
encoder_stride=self.encoder_stride, - out_features=self.out_features, - out_indices=self.out_indices, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = FocalNetModel(config=config) - model.set_train(False) - result = model(pixel_values) - - expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) - expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = FocalNetBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size, 8, 8]) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, config.hidden_sizes[:-1]) - - # verify backbone works with out_features=None - config.out_features = None - model = FocalNetBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.image_size * 2, 4, 4]) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) - - def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): - model = FocalNetForMaskedImageModeling(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) - ) - - # test greyscale images - config.num_channels = 1 - model = FocalNetForMaskedImageModeling(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = FocalNetForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = FocalNetForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class FocalNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - FocalNetForImageClassification, - FocalNetForMaskedImageModeling, - FocalNetBackbone, - ) - if is_mindspore_available() - else () - ) - - fx_compatible = False - - test_pruning = False - 
test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = FocalNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=FocalNetConfig, embed_dim=37, has_text_modality=False) - - def test_config(self): - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_for_masked_image_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @unittest.skip(reason="FocalNet does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="FocalNet does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes[:-1]: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # FocalNet has a different seq_length - patch_size = ( - config.patch_size - if isinstance(config.patch_size, collections.abc.Iterable) - else (config.patch_size, config.patch_size) - ) - - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - reshaped_hidden_states = outputs.reshaped_hidden_states - self.assertEqual(len(reshaped_hidden_states), expected_num_layers) - - batch_size, num_channels, height, width = reshaped_hidden_states[0].shape - reshaped_hidden_states = ( - reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) - ) - self.assertListEqual( - list(reshaped_hidden_states.shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - image_size = ( - self.model_tester.image_size - if isinstance(self.model_tester.image_size, collections.abc.Iterable) - else (self.model_tester.image_size, self.model_tester.image_size) - ) - - for 
model_class in self.all_model_classes[:-1]: - inputs_dict["output_hidden_states"] = True - self.check_hidden_states_output(inputs_dict, config, model_class, image_size) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - self.check_hidden_states_output(inputs_dict, config, model_class, image_size) - - def test_hidden_states_output_with_padding(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.patch_size = 3 - - image_size = ( - self.model_tester.image_size - if isinstance(self.model_tester.image_size, collections.abc.Iterable) - else (self.model_tester.image_size, self.model_tester.image_size) - ) - patch_size = ( - config.patch_size - if isinstance(config.patch_size, collections.abc.Iterable) - else (config.patch_size, config.patch_size) - ) - - padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) - padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) - - for model_class in self.all_model_classes[:-1]: - inputs_dict["output_hidden_states"] = True - self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/focalnet-tiny" - model = FocalNetModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - -@require_vision -@require_mindspore -class FocalNetModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - # TODO update organization - return AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = FocalNetForImageClassification.from_pretrained("microsoft/focalnet-tiny") - image_processor = self.default_image_processor - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = mindspore.tensor([0.2166, -0.4368, 0.2191]) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - self.assertEqual(outputs.logits.argmax(axis=-1).item(), 281) - - -@require_mindspore -class FocalNetBackboneTest(BackboneTesterMixin, unittest.TestCase): - all_model_classes = (FocalNetBackbone,) if is_mindspore_available() else () - config_class = FocalNetConfig - - has_attentions = False - - def setUp(self): - self.model_tester = FocalNetModelTester(self) diff
--git a/tests/transformers/models/fsmt/__init__.py b/tests/transformers/models/fsmt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/fsmt/test_modeling_fsmt.py b/tests/transformers/models/fsmt/test_modeling_fsmt.py deleted file mode 100644 index 8d7980901..000000000 --- a/tests/transformers/models/fsmt/test_modeling_fsmt.py +++ /dev/null @@ -1,616 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Huggingface -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile -import unittest - -# import timeout_decorator # noqa -from parameterized import parameterized - -import numpy as np - -from mindnlp.transformers import FSMTConfig -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer - from mindnlp.transformers.models.fsmt.modeling_fsmt import ( - SinusoidalPositionalEmbedding, - _prepare_fsmt_decoder_inputs, - invert_mask, - shift_tokens_right, - ) - # from mindnlp.transformers.pipelines import TranslationPipeline - - -class FSMTModelTester: - def __init__( - self, - parent, - src_vocab_size=99, - tgt_vocab_size=99, - langs=["ru", "en"], - batch_size=13, - seq_length=7, - is_training=False, - use_labels=False, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - bos_token_id=0, - pad_token_id=1, - eos_token_id=2, - ): - self.parent = parent - self.src_vocab_size = src_vocab_size - self.tgt_vocab_size = tgt_vocab_size - self.langs = langs - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.eos_token_id = eos_token_id - # torch.manual_seed(0) - mindspore.set_seed(0) - - # hack needed for modeling_common tests - despite not really having this attribute in this model - self.vocab_size = self.src_vocab_size - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.src_vocab_size).clamp( - 
3, - ) - input_ids[:, -1] = 2 # Eos Token - - config = self.get_config() - inputs_dict = prepare_fsmt_inputs_dict(config, input_ids) - return config, inputs_dict - - def get_config(self): - return FSMTConfig( - vocab_size=self.src_vocab_size, # hack needed for common tests - src_vocab_size=self.src_vocab_size, - tgt_vocab_size=self.tgt_vocab_size, - langs=self.langs, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - inputs_dict["decoder_input_ids"] = inputs_dict["input_ids"] - inputs_dict["decoder_attention_mask"] = inputs_dict["attention_mask"] - inputs_dict["use_cache"] = False - return config, inputs_dict - - -def prepare_fsmt_inputs_dict( - config, - input_ids, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - } - - -@require_mindspore -class FSMTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (FSMTModel, FSMTForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (FSMTForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": FSMTModel, - "summarization": FSMTForConditionalGeneration, - "text2text-generation": FSMTForConditionalGeneration, - "translation": FSMTForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_pruning = False - test_missing_keys = False - - def setUp(self): - self.model_tester = FSMTModelTester(self) - self.langs = ["en", "ru"] - config = { - "langs": self.langs, - "src_vocab_size": 10, - "tgt_vocab_size": 20, - } - # XXX: hack to appease to all other models requiring `vocab_size` - config["vocab_size"] = 99 # no such thing in FSMT - self.config_tester = ConfigTester(self, config_class=FSMTConfig, **config) - - def test_config(self): - self.config_tester.run_common_tests() - - # XXX: override test_model_get_set_embeddings / different Embedding type - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding)) - model.set_input_embeddings(nn.Embedding(10, 10)) - x = model.get_output_embeddings() - # self.assertTrue(x is None or isinstance(x, 
nn.modules.sparse.Embedding)) - self.assertTrue(x is None or isinstance(x, nn.Embedding)) - - def test_initialization_more(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - model = FSMTModel(config) - model.eval() - # test init - # self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) - - def _check_var(module): - """Check that we initialized various parameters from N(0, config.init_std).""" - self.assertAlmostEqual(ops.std(module.weight).item(), config.init_std, 2) - - _check_var(model.encoder.embed_tokens) - _check_var(model.encoder.layers[0].self_attn.k_proj) - _check_var(model.encoder.layers[0].fc1) - # XXX: different std for fairseq version of SinusoidalPositionalEmbedding - # self.assertAlmostEqual(torch.std(model.encoder.embed_positions.weights).item(), config.init_std, 2) - - def test_advanced_inputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - config.use_cache = False - inputs_dict["input_ids"][:, -2:] = config.pad_token_id - decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs( - config, inputs_dict["input_ids"] - ) - model = FSMTModel(config).eval() - - decoder_features_with_created_mask = model(**inputs_dict)[0] - decoder_features_with_passed_mask = model( - decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict - )[0] - _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask) - useless_mask = ops.zeros_like(decoder_attn_mask) - decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] - self.assertTrue(isinstance(decoder_features, mindspore.Tensor)) # no hidden states or attentions - self.assertEqual( - decoder_features.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, config.tgt_vocab_size), - ) - # if decoder_attn_mask.min().item() < -1e3: # some tokens were masked - # self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item()) - - # Test different encoder attention masks - decoder_features_with_long_encoder_mask = model( - inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long() - )[0] - _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) - - def test_save_load_missing_keys(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - # @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - # def test_export_to_onnx(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs() - # model = FSMTModel(config) - # with tempfile.TemporaryDirectory() as tmpdirname: - # torch.onnx.export( - # model, - # (inputs_dict["input_ids"], inputs_dict["attention_mask"]), - # f"{tmpdirname}/fsmt_test.onnx", - # export_params=True, - # opset_version=12, - # input_names=["input_ids", "attention_mask"], - # ) - - def test_ensure_weights_are_shared(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - - config.tie_word_embeddings = True - model = FSMTForConditionalGeneration(config) - - # FSMT shares three weights. 
- # Not an issue to not have these correctly tied for torch.load, but it is an issue for safetensors. - - # self.assertEqual( - # len( - # { - # model.get_output_embeddings().weight.data_ptr(), - # model.get_input_embeddings().weight.data_ptr(), - # model.base_model.decoder.output_projection.weight.data_ptr(), - # } - # ), - # 1, - # ) - - config.tie_word_embeddings = False - model = FSMTForConditionalGeneration(config) - - # FSMT shares three weights. - # Not an issue to not have these correctly tied for torch.load, but it is an issue for safetensors. - - # self.assertEqual( - # len( - # { - # model.get_output_embeddings().weight.data_ptr(), - # model.get_input_embeddings().weight.data_ptr(), - # model.base_model.decoder.output_projection.weight.data_ptr(), - # } - # ), - # 2, - # ) - - @unittest.skip(reason="can't be implemented for FSMT due to dual vocab.") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Passing inputs_embeds not implemented for FSMT.") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Input ids is required for FSMT.") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="model weights aren't tied in FSMT.") - def test_tie_model_weights(self): - pass - - @unittest.skip(reason="TODO: Decoder embeddings cannot be resized at the moment") - def test_resize_embeddings_untied(self): - pass - - @unittest.skip(reason="") - def test_model_get_set_embeddings(self): - pass - - -@require_mindspore -class FSMTHeadTests(unittest.TestCase): - src_vocab_size = 99 - tgt_vocab_size = 99 - langs = ["ru", "en"] - - def _get_config(self): - return FSMTConfig( - src_vocab_size=self.src_vocab_size, - tgt_vocab_size=self.tgt_vocab_size, - langs=self.langs, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - - def _get_config_and_data(self): - input_ids = mindspore.tensor( - [ - [71, 82, 18, 33, 46, 91, 2], - [68, 34, 26, 58, 30, 82, 2], - [5, 97, 17, 39, 94, 40, 2], - [76, 83, 94, 25, 70, 78, 2], - [87, 59, 41, 35, 48, 66, 2], - [55, 13, 16, 58, 5, 2, 1], # note padding - [64, 27, 31, 51, 12, 75, 2], - [52, 64, 86, 17, 83, 39, 2], - [48, 61, 9, 24, 71, 82, 2], - [26, 1, 60, 48, 22, 13, 2], - [21, 5, 62, 28, 14, 76, 2], - [45, 98, 37, 86, 59, 48, 2], - [70, 70, 50, 9, 28, 0, 2], - ], - dtype = mindspore.int64, - ) - - batch_size = input_ids.shape[0] - config = self._get_config() - return config, input_ids, batch_size - - def test_generate_beam_search(self): - input_ids = mindspore.tensor([[71, 82, 2], [68, 34, 2]], dtype=mindspore.int64) - config = self._get_config() - lm_model = FSMTForConditionalGeneration(config) - lm_model.eval() - - max_length = 5 - new_input_ids = lm_model.generate( - input_ids.copy(), - do_sample=True, - num_return_sequences=1, - num_beams=2, - no_repeat_ngram_size=3, - max_length=max_length, - ) - self.assertEqual(new_input_ids.shape, (input_ids.shape[0], max_length)) - - def test_shift_tokens_right(self): - input_ids = mindspore.tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=mindspore.int64) - shifted = shift_tokens_right(input_ids, 1) - n_pad_before = input_ids.eq(1).float().sum() - n_pad_after = shifted.eq(1).float().sum() - self.assertEqual(shifted.shape, input_ids.shape) - self.assertEqual(n_pad_after, n_pad_before - 1) - self.assertTrue(ops.eq(shifted[:, 0], 2).all()) - - 
@require_mindspore - def test_generate_fp16(self): - config, input_ids, batch_size = self._get_config_and_data() - attention_mask = input_ids.ne(1) - model = FSMTForConditionalGeneration(config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - def test_dummy_inputs(self): - config, *_ = self._get_config_and_data() - model = FSMTForConditionalGeneration(config).eval() - model(**model.dummy_inputs) - - def test_prepare_fsmt_decoder_inputs(self): - config, *_ = self._get_config_and_data() - input_ids = _long_tensor(([4, 4, 2])) - decoder_input_ids = _long_tensor([[26388, 2, config.pad_token_id]]) - causal_mask_dtype = mindspore.float32 - ignore = np.finfo(mindspore.dtype_to_nptype(causal_mask_dtype)).min - decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_fsmt_decoder_inputs( - config, input_ids, decoder_input_ids, causal_mask_dtype=causal_mask_dtype - ) - expected_causal_mask = mindspore.tensor( - [[0, ignore, ignore], [0, 0, ignore], [0, 0, 0]] # never attend to the final token, because its pad - ) - self.assertEqual(decoder_attn_mask.shape, decoder_input_ids.shape) - self.assertTrue(ops.eq(expected_causal_mask, causal_mask).all()) - - -def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): - """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.numpy(), b.numpy(), atol=atol): - return True - raise - except Exception: - if len(prefix) > 0: - prefix = f"{prefix}: " - raise AssertionError(f"{prefix}{a} != {b}") - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -TOLERANCE = 1e-4 - - -pairs = [ - ["en-ru"], - ["ru-en"], - ["en-de"], - ["de-en"], -] - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class FSMTModelIntegrationTests(unittest.TestCase): - tokenizers_cache = {} - models_cache = {} - default_mname = "facebook/wmt19-en-ru" - - @cached_property - def default_tokenizer(self): - return self.get_tokenizer(self.default_mname) - - @cached_property - def default_model(self): - return self.get_model(self.default_mname) - - def get_tokenizer(self, mname): - if mname not in self.tokenizers_cache: - self.tokenizers_cache[mname] = FSMTTokenizer.from_pretrained(mname) - return self.tokenizers_cache[mname] - - def get_model(self, mname): - if mname not in self.models_cache: - self.models_cache[mname] = FSMTForConditionalGeneration.from_pretrained(mname) - # if torch_device == "cuda": - # self.models_cache[mname].half() - self.models_cache[mname].half() - return self.models_cache[mname] - - @slow - def test_inference_no_head(self): - tokenizer = self.default_tokenizer - model = FSMTModel.from_pretrained(self.default_mname) - - src_text = "My friend computer will translate this for me" - input_ids = tokenizer([src_text], return_tensors="ms")["input_ids"] - input_ids = _long_tensor(input_ids) - inputs_dict = prepare_fsmt_inputs_dict(model.config, input_ids) - with mindspore._no_grad(): - output = model(**inputs_dict)[0] - expected_shape = (1, 10, model.config.tgt_vocab_size) - self.assertEqual(output.shape, expected_shape) - # expected numbers were generated when en-ru model, using just fairseq's model4.pt - # may have to adjust if switched to a different checkpoint - expected_slice = mindspore.tensor( - [[-1.5753, -1.5753, 2.8975], [-0.9540, -0.9540, 1.0299], [-3.3131, -3.3131, 0.5219]] - ) - 
self.assertTrue(np.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=TOLERANCE)) - - def translation_setup(self, pair): - text = { - "en": "Machine learning is great, isn't it?", - "ru": "Машинное обучение - это здорово, не так ли?", - "de": "Maschinelles Lernen ist großartig, oder?", - } - - src, tgt = pair.split("-") - print(f"Testing {src} -> {tgt}") - mname = f"facebook/wmt19-{pair}" - - src_text = text[src] - tgt_text = text[tgt] - - tokenizer = self.get_tokenizer(mname) - model = self.get_model(mname) - return tokenizer, model, src_text, tgt_text - - @parameterized.expand(pairs) - @slow - def test_translation_direct(self, pair): - tokenizer, model, src_text, tgt_text = self.translation_setup(pair) - - input_ids = tokenizer.encode(src_text, return_tensors="ms") - - outputs = model.generate(input_ids) - decoded = tokenizer.decode(outputs[0], skip_special_tokens=True) - assert decoded == tgt_text, f"\n\ngot: {decoded}\nexp: {tgt_text}\n" - - # @parameterized.expand(pairs) - # @slow - # def test_translation_pipeline(self, pair): - # tokenizer, model, src_text, tgt_text = self.translation_setup(pair) - # pipeline = TranslationPipeline(model, tokenizer, framework="ms") - # output = pipeline([src_text]) - # self.assertEqual([tgt_text], [x["translation_text"] for x in output]) - - -@require_mindspore -class TestSinusoidalPositionalEmbeddings(unittest.TestCase): - padding_idx = 1 - tolerance = 1e-4 - - def test_basic(self): - input_ids = mindspore.tensor([[4, 10]], dtype=mindspore.int64) - emb1 = SinusoidalPositionalEmbedding(num_positions=6, embedding_dim=6, padding_idx=self.padding_idx) - emb = emb1(input_ids) - desired_weights = mindspore.tensor( - [ - [9.0930e-01, 1.9999e-02, 2.0000e-04, -4.1615e-01, 9.9980e-01, 1.0000e00], - [1.4112e-01, 2.9995e-02, 3.0000e-04, -9.8999e-01, 9.9955e-01, 1.0000e00], - ] - ) - self.assertTrue( - np.allclose(emb[0].numpy(), desired_weights.numpy(), atol=self.tolerance), - msg=f"\nexp:\n{desired_weights}\ngot:\n{emb[0]}\n", - ) - - def test_odd_embed_dim(self): - # odd embedding_dim is allowed - SinusoidalPositionalEmbedding(num_positions=4, embedding_dim=5, padding_idx=self.padding_idx) - - # odd num_embeddings is allowed - SinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=self.padding_idx) - - @unittest.skip(reason="different from marian (needs more research)") - def test_positional_emb_weights_against_marian(self): - desired_weights = mindspore.tensor( - [ - [0, 0, 0, 0, 0], - [0.84147096, 0.82177866, 0.80180490, 0.78165019, 0.76140374], - [0.90929741, 0.93651021, 0.95829457, 0.97505713, 0.98720258], - ] - ) - emb1 = SinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512, padding_idx=self.padding_idx) - weights = emb1.weights.data[:3, :5] - # XXX: only the 1st and 3rd lines match - this is testing against - # verbatim copy of SinusoidalPositionalEmbedding from fairseq - self.assertTrue( - np.allclose(weights.numpy(), desired_weights.numpy(), atol=self.tolerance), - msg=f"\nexp:\n{desired_weights}\ngot:\n{weights}\n", - ) - - # test that forward pass is just a lookup, there is no ignore padding logic - input_ids = mindspore.tensor( - [[4, 10, self.padding_idx, self.padding_idx, self.padding_idx]], dtype=mindspore.int64 - ) - no_cache_pad_zero = emb1(input_ids)[0] - # XXX: only the 1st line matches the 3rd - self.assertTrue( - np.allclose(mindspore.tensor(desired_weights).numpy(), no_cache_pad_zero[:3, :5].numpy(), atol=1e-3) - ) diff --git a/tests/transformers/models/funnel/__init__.py 
b/tests/transformers/models/funnel/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/funnel/test_modeling_funnel.py b/tests/transformers/models/funnel/test_modeling_funnel.py deleted file mode 100644 index ac8885811..000000000 --- a/tests/transformers/models/funnel/test_modeling_funnel.py +++ /dev/null @@ -1,518 +0,0 @@ -# coding=utf-8 -# Copyright 2020 HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest -import numpy as np - -from mindnlp.transformers import FunnelConfig, FunnelTokenizer -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - FunnelBaseModel, - FunnelForMaskedLM, - FunnelForMultipleChoice, - FunnelForPreTraining, - FunnelForQuestionAnswering, - FunnelForSequenceClassification, - FunnelForTokenClassification, - FunnelModel, - ) - from mindnlp.transformers.models.funnel.modeling_funnel import FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST - - -class FunnelModelTester: - """You can also import this e.g, from .test_modeling_funnel import FunnelModelTester""" - - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - block_sizes=[1, 1, 2], - num_decoder_layers=1, - d_model=32, - n_head=4, - d_head=8, - d_inner=37, - hidden_act="gelu_new", - hidden_dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.0, - max_position_embeddings=512, - type_vocab_size=3, - initializer_std=0.02, # Set to a smaller value, so we can keep the small error threshold (1e-5) in the test - num_labels=3, - num_choices=4, - scope=None, - base=False, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.block_sizes = block_sizes - self.num_decoder_layers = num_decoder_layers - self.d_model = d_model - self.n_head = n_head - self.d_head = d_head - self.d_inner = d_inner - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = 2 - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.initializer_std = initializer_std - - # Used in the tests to check the size of the first attention layer - self.num_attention_heads 
= n_head - # Used in the tests to check the size of the first hidden state - self.hidden_size = self.d_model - # Used in the tests to check the number of output hidden states/attentions - self.num_hidden_layers = sum(self.block_sizes) + (0 if base else self.num_decoder_layers) - # FunnelModel adds two hidden layers: input embeddings and the sum of the upsampled encoder hidden state with - # the last hidden state of the first block (which is the first hidden state of the decoder). - if not base: - self.expected_num_hidden_layers = self.num_hidden_layers + 2 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).astype(mindspore.int32) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).astype(mindspore.int32) - choice_labels = ids_tensor([self.batch_size], self.num_choices).astype(mindspore.int32) - fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1).astype(mindspore.int32) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ) - - def get_config(self): - return FunnelConfig( - vocab_size=self.vocab_size, - block_sizes=self.block_sizes, - num_decoder_layers=self.num_decoder_layers, - d_model=self.d_model, - n_head=self.n_head, - d_head=self.d_head, - d_inner=self.d_inner, - hidden_act=self.hidden_act, - hidden_dropout=self.hidden_dropout, - attention_dropout=self.attention_dropout, - activation_dropout=self.activation_dropout, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_std=self.initializer_std, - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = FunnelModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) - - model.config.truncate_seq = False - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) - - model.config.separate_cls = False - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.d_model)) - - def create_and_check_base_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = FunnelBaseModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) - - 
model.config.truncate_seq = False - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 3, self.d_model)) - - model.config.separate_cls = False - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, 2, self.d_model)) - - def create_and_check_for_pretraining( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_labels = self.num_labels - model = FunnelForPreTraining(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = FunnelForMaskedLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_labels = self.num_labels - model = FunnelForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_choices = self.num_choices - model = FunnelForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - config.num_labels = self.num_labels - model = FunnelForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ): - model = FunnelForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, 
- ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - fake_token_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class FunnelModelTest(ModelTesterMixin, unittest.TestCase): - test_head_masking = False - test_pruning = False - all_model_classes = ( - ( - FunnelModel, - FunnelForMaskedLM, - FunnelForPreTraining, - FunnelForQuestionAnswering, - FunnelForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": (FunnelBaseModel, FunnelModel), - "fill-mask": FunnelForMaskedLM, - "question-answering": FunnelForQuestionAnswering, - "text-classification": FunnelForSequenceClassification, - "token-classification": FunnelForTokenClassification, - "zero-shot": FunnelForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int32 - ) - return inputs_dict - - def setUp(self): - self.model_tester = FunnelModelTester(self) - self.config_tester = ConfigTester(self, config_class=FunnelConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - - for param in ["r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"]: - if hasattr(module, param) and getattr(module, param) is not None: - weight = getattr(module, param) - weight.data.fill_(3) - - -@require_mindspore -class FunnelBaseModelTest(ModelTesterMixin, unittest.TestCase): - test_head_masking = False - test_pruning = False - all_model_classes = ( - (FunnelBaseModel, FunnelForMultipleChoice, FunnelForSequenceClassification) 
if is_mindspore_available() else () - ) - - def setUp(self): - self.model_tester = FunnelModelTester(self, base=True) - self.config_tester = ConfigTester(self, config_class=FunnelConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_base_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_base_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - # overwrite from test_modeling_common - def test_training(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - if model_class.__name__ == "FunnelBaseModel": - continue - model = model_class(config) - model.set_train(False) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - - for param in ["r_w_bias", "r_r_bias", "r_kernel", "r_s_bias", "seg_embed"]: - if hasattr(module, param) and getattr(module, param) is not None: - weight = getattr(module, param) - weight.data.fill_(3) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class FunnelModelIntegrationTest(unittest.TestCase): - def test_inference_tiny_model(self): - batch_size = 13 - sequence_length = 7 - input_ids = ops.arange(0, batch_size * sequence_length).long().reshape(batch_size, sequence_length) - lengths = [0, 1, 2, 3, 4, 5, 6, 4, 1, 3, 5, 0, 1] - token_type_ids = mindspore.tensor([[2] + [0] * a + [1] * (sequence_length - a - 1) for a in lengths]) - - model = FunnelModel.from_pretrained("sgugger/funnel-random-tiny") - output = model(input_ids, token_type_ids=token_type_ids)[0].abs() - - expected_output_sum = mindspore.tensor(2344.8352) - expected_output_mean = mindspore.tensor(0.8052) - self.assertTrue(np.allclose(output.asnumpy().sum(), expected_output_sum.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(output.asnumpy().mean(), expected_output_mean.asnumpy(), atol=1e-4)) - - attention_mask = mindspore.tensor([[1] * 7, [1] * 4 + [0] * 3] * 6 + [[0, 1, 1, 0, 0, 1, 1]]) - output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0].abs() - - expected_output_sum = mindspore.tensor(2343.8425) - expected_output_mean = mindspore.tensor(0.8049) - self.assertTrue(np.allclose(output.asnumpy().sum(), expected_output_sum.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(output.asnumpy().mean(), expected_output_mean.asnumpy(), atol=1e-4)) - - @slow - def test_inference_model(self): - tokenizer = FunnelTokenizer.from_pretrained("huggingface/funnel-small") - model = FunnelModel.from_pretrained("huggingface/funnel-small") - inputs = tokenizer("Hello! 
I am the Funnel Transformer model.", return_tensors="ms") - output = model(**inputs)[0] - - expected_output_sum = mindspore.tensor(235.7246) - expected_output_mean = mindspore.tensor(0.0256) - self.assertTrue(np.allclose(output.asnumpy().sum(), expected_output_sum.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(output.asnumpy().mean(), expected_output_mean.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/funnel/test_tokenization_funnel.py b/tests/transformers/models/funnel/test_tokenization_funnel.py deleted file mode 100644 index 9b31f9f55..000000000 --- a/tests/transformers/models/funnel/test_tokenization_funnel.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding=utf-8 -# Copyright 2020 HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import unittest -import mindspore -from mindnlp.transformers import FunnelTokenizer, FunnelTokenizerFast -from mindnlp.transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES -
-from ...test_tokenization_common import TokenizerTesterMixin -
-class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "funnel-transformer/small" - tokenizer_class = FunnelTokenizer - rust_tokenizer_class = FunnelTokenizerFast - test_rust_tokenizer = True - space_between_special_tokens = True - - def setUp(self): - super().setUp() - - vocab_tokens = [ - "<unk>", - "<cls>", - "<sep>", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - def get_tokenizer(self, **kwargs): - return FunnelTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00e9d,running" - output_text = "unwanted, running" - return input_text, output_text - - def test_full_tokenizer(self): - tokenizer = self.tokenizer_class(self.vocab_file) - - tokens = tokenizer.tokenize("UNwant\u00e9d,running") - self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) - - def test_token_type_ids(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - inputs = tokenizer("UNwant\u00e9d,running") - sentence_len = len(inputs["input_ids"]) - 1 - self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len) - - inputs = tokenizer("UNwant\u00e9d,running", "UNwant\u00e9d,running") - self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len + [1] * sentence_len) diff --git a/tests/transformers/models/fuyu/__init__.py b/tests/transformers/models/fuyu/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git
a/tests/transformers/models/fuyu/test_modeling_fuyu.py b/tests/transformers/models/fuyu/test_modeling_fuyu.py deleted file mode 100644 index 16f9b7215..000000000 --- a/tests/transformers/models/fuyu/test_modeling_fuyu.py +++ /dev/null @@ -1,402 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the Mindspore Fuyu model.""" - -import io -import unittest - -import requests - -from mindnlp.transformers import FuyuConfig -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_vision_available(): - from PIL import Image - - -if is_mindspore_available() and is_vision_available(): - from mindnlp.transformers import FuyuProcessor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import FuyuForCausalLM - - -class FuyuModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - image_size=30, - patch_size=15, - num_channels=3, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], 
self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return config, input_ids, input_mask, sequence_labels, token_labels - - def get_config(self): - return FuyuConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - ): - model = FuyuForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = FuyuForCausalLM(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = FuyuForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = FuyuForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - 
            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            past_key_values=past_key_values,
-            output_hidden_states=True,
-        )["hidden_states"][0]
-
-        # select random slice
-        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
-        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
-
-        # test that outputs are equal for slice
-        import numpy as np
-        self.parent.assertTrue(np.allclose(output_from_past_slice.numpy(), output_from_no_past_slice.numpy(), atol=1e-3))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (
-            config,
-            input_ids,
-            input_mask,
-            sequence_labels,
-            token_labels,
-        ) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
-        return config, inputs_dict
-
-
-@require_mindspore
-class FuyuModelTest(ModelTesterMixin, unittest.TestCase):
-# class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (FuyuForCausalLM,) if is_mindspore_available() else ()
-    pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_mindspore_available() else {}
-
-    test_head_masking = False
-    test_pruning = False
-    test_cpu_offload = False
-    test_disk_offload = False
-    test_model_parallel = False
-
-    def setUp(self):
-        self.model_tester = FuyuModelTester(self)
-
-    @unittest.skip(
-        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
-    # TODO: Fix me (once this model gets more usage)
-    @unittest.skip(reason="Does not work on the tiny model.")
-    def test_disk_offload_bin(self):
-        super().test_disk_offload()
-
-    # TODO: Fix me (once this model gets more usage)
-    @unittest.skip(reason="Does not work on the tiny model.")
-    def test_disk_offload_safetensors(self):
-        super().test_disk_offload()
-
-    # TODO: Fix me (once this model gets more usage)
-    @unittest.skip(reason="Does not work on the tiny model.")
-    def test_model_parallelism(self):
-        super().test_model_parallelism()
-
-
-@slow
-@require_mindspore
-class FuyuModelIntegrationTest(unittest.TestCase):
-    @cached_property
-    def default_processor(self):
-        return FuyuProcessor.from_pretrained("adept/fuyu-8b")
-
-    @cached_property
-    def default_model(self):
-        return FuyuForCausalLM.from_pretrained("adept/fuyu-8b")
-
-    def test_greedy_generation(self):
-        processor = self.default_processor
-        model = self.default_model
-
-        url = "https://hf-mirror.com/adept/fuyu-8b/resolve/main/bus.png"
-        image = Image.open(io.BytesIO(requests.get(url).content))
-        # image = Image.open("/home/wuzhirong/new_mindspore/mindnlp_fuyu/tests/ut/transformers/models/fuyu/bus.png")
-
-        text_prompt_coco_captioning = "Generate a coco-style caption.\n"
-
-        inputs =
processor(text=text_prompt_coco_captioning, images=image, return_tensors="ms") - generated_ids = model.generate(**inputs, max_new_tokens=10) - - # take the last 8 tokens (in order to skip special \n\x04 characters) and decode them - generated_text = processor.batch_decode(generated_ids[:, -8:], skip_special_tokens=True)[0] - self.assertEqual(generated_text, "A blue bus parked on the side of a road.") - - -""" - @slow - @require_torch_accelerator - def test_model_8b_chat_greedy_generation_bus_color(self): - EXPECTED_TEXT_COMPLETION = "The bus is blue.\n|ENDOFTEXT|" - text_prompt_bus_color = "What color is the bus?\n" - model_inputs_bus_color = self.processor(text=text_prompt_bus_color, images=self.bus_image_pil) - - generated_tokens = self.model.generate(**model_inputs_bus_color, max_new_tokens=10) - text = self.processor.tokenizer.batch_decode(generated_tokens) - end_sequence = text[0].split("\x04")[1] - clean_sequence = ( - end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")] - if "|ENDOFTEXT|" in end_sequence - else end_sequence - ) - self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence) - - @slow - @require_torch_accelerator - def test_model_8b_chat_greedy_generation_chart_vqa(self): - EXPECTED_TEXT_TOKENS = ["The","life expectancy","at","birth","of male","s in","","20","18","is","","80",".","7",".","\n","|ENDOFTEXT|",] # fmt: skip - expected_text_completion = " ".join(EXPECTED_TEXT_TOKENS) # TODO make sure the end string matches - - text_prompt_chart_vqa = "What is the highest life expectancy at birth of male?\n" - - chart_image_url = ( - "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/chart.png" - ) - chart_image_pil = Image.open(io.BytesIO(requests.get(chart_image_url).content)) - - model_inputs_chart_vqa = self.processor(text=text_prompt_chart_vqa, images=chart_image_pil) - generated_tokens = self.model.generate(**model_inputs_chart_vqa, max_new_tokens=10) - text = self.processor.tokenizer.batch_decode(generated_tokens) - end_sequence = text[0].split("\x04")[1] - clean_sequence = ( - end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")] - if "|ENDOFTEXT|" in end_sequence - else end_sequence - ) - self.assertEqual(expected_text_completion, clean_sequence) - - @slow - @require_torch_accelerator - def test_model_8b_chat_greedy_generation_bounding_box(self): - EXPECTED_TEXT_COMPLETION = "\x00194213202244\x01|ENDOFTEXT|" - text_prompt_bbox = "When presented with a box, perform OCR to extract text contained within it. 
If provided with text, generate the corresponding bounding box.\\nWilliams" # noqa: E231 - - bbox_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bbox_sample_image.png" - bbox_image_pil = Image.open(io.BytesIO(requests.get(bbox_image_url).content)) - - model_inputs_bbox = self.processor(text=text_prompt_bbox, images=bbox_image_pil) - generated_tokens = self.model.generate(**model_inputs_bbox, max_new_tokens=10) - text = self.processor.tokenizer.batch_decode(generated_tokens) - end_sequence = text[0].split("\x04")[1] - clean_sequence = ( - end_sequence[: end_sequence.find("|ENDOFTEXT|") + len("|ENDOFTEXT|")] - if "|ENDOFTEXT|" in end_sequence - else end_sequence - ) - self.assertEqual(EXPECTED_TEXT_COMPLETION, clean_sequence) -""" diff --git a/tests/transformers/models/gemma/__init__.py b/tests/transformers/models/gemma/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gemma/test_modeling_gemma.py b/tests/transformers/models/gemma/test_modeling_gemma.py deleted file mode 100644 index 80bfe69ce..000000000 --- a/tests/transformers/models/gemma/test_modeling_gemma.py +++ /dev/null @@ -1,536 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Gemma model. 
""" - -import unittest - -import numpy as np -from parameterized import parameterized - -from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import GemmaForCausalLM, GemmaForSequenceClassification, GemmaModel - - -class GemmaModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - self.head_dim = self.hidden_size // self.num_attention_heads - - # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - # Ignore copy - def get_config(self): - return GemmaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - 
hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - head_dim=self.head_dim, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Gemma - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = GemmaModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Gemma - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = GemmaModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Gemma - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = GemmaForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Gemma - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = GemmaForCausalLM(config=config) - - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Gemma - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (GemmaModel, GemmaForCausalLM, GemmaForSequenceClassification) if is_mindspore_available() else () - all_generative_model_classes = (GemmaForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GemmaModel, - "text-classification": GemmaForSequenceClassification, - "text-generation": GemmaForCausalLM, - "zero-shot": GemmaForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = GemmaModelTester(self) - self.config_tester = ConfigTester(self, config_class=GemmaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_Gemma_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - print(config) - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = GemmaForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Gemma_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = GemmaForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Gemma_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = GemmaForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @unittest.skip("TODO @gante fix this for Llama") - @parameterized.expand([(1, False), (1, True), (4, False)]) - def test_new_cache_format(self, num_beams, do_sample): - pass - - @unittest.skip("Gemma buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip("Gemma uses GQA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - -@require_mindspore -@slow -class GemmaIntegrationTest(unittest.TestCase): - input_text = ["Hello I am doing", "Hi today"] - - def 
test_model_2b_fp32(self): - model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float32) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_2b_fp16(self): - model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.float16) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - print(output_text) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_2b_fp16_static_cache(self): - model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.float16) - - model.generation_config.cache_implementation = "static" - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - # def test_model_2b_bf16(self): - # model_id = "google/gemma-2b" - # EXPECTED_TEXTS = [ - # "Hello I am doing a project on the 1990s and I need to know what the most popular music", - # "Hi today I am going to share with you a very easy and simple recipe of Khichdi", - # ] - - # model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.bfloat16) - - # tokenizer = AutoTokenizer.from_pretrained(model_id) - # inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - # output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - # output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - # self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_2b_eager(self): - model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I am looking for some information on the ", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.float16) - - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, 
skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - @unittest.skip("The test will not fit our CI runners") - def test_model_7b_fp32(self): - model_id = "google/gemma-7b" - EXPECTED_TEXTS = [ - "Hello my name is ***** ***** I will be assisting you today. I am sorry to hear about your issue. I will", - "Hi,\n\nI have a problem with my 2005 1.6 16", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_7b_fp16(self): - model_id = "google/gemma-7b" - EXPECTED_TEXTS = [ - """Hello I am doing a project on a 1999 4.0L 4x4. I""", - "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.float16) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - # def test_model_7b_bf16(self): - # model_id = "google/gemma-7b" - # EXPECTED_TEXTS = [ - # """Hello I am doing a project on a 1991 240sx and I am trying to find""", - # "Hi today I am going to show you how to make a very simple and easy to make a very simple and", - # ] - - # model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.bfloat16) - - # tokenizer = AutoTokenizer.from_pretrained(model_id) - # inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - # output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - # output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - # self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_7b_fp16_static_cache(self): - model_id = "google/gemma-7b" - EXPECTED_TEXTS = [ - """Hello I am doing a project on a 1999 4.0L 4x4. I""", - "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, ms_dtype=mindspore.float16) - - model.generation_config.cache_implementation = "static" - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/tests/transformers/models/gemma2/__init__.py b/tests/transformers/models/gemma2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gemma2/test_modeling_gemma2.py b/tests/transformers/models/gemma2/test_modeling_gemma2.py deleted file mode 100644 index 0c85127e5..000000000 --- a/tests/transformers/models/gemma2/test_modeling_gemma2.py +++ /dev/null @@ -1,164 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Gemma2 model.""" - -import unittest - -from pytest import mark - -from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer, Gemma2Config -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_mindspore_gpu, - slow, -) - -from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester -from ...test_configuration_common import ConfigTester - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import ( - Gemma2ForCausalLM, - Gemma2ForSequenceClassification, - Gemma2ForTokenClassification, - Gemma2Model, - ) - - -class Gemma2ModelTester(GemmaModelTester): - if is_mindspore_available(): - config_class = Gemma2Config - model_class = Gemma2Model - for_causal_lm_class = Gemma2ForCausalLM - for_sequence_class = Gemma2ForSequenceClassification - for_token_class = Gemma2ForTokenClassification - - -@require_mindspore -class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): - all_model_classes = ( - (Gemma2Model, Gemma2ForCausalLM, Gemma2ForSequenceClassification, Gemma2ForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = () - pipeline_model_mapping = ( - { - "feature-extraction": Gemma2Model, - "text-classification": Gemma2ForSequenceClassification, - "token-classification": Gemma2ForTokenClassification, - "text-generation": Gemma2ForCausalLM, - "zero-shot": Gemma2ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - _is_stateful = True - model_split_percents = [0.5, 0.6] - _torch_compile_test_ckpt = "google/gemma-2-9b" - - def setUp(self): - self.model_tester = Gemma2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Gemma2Config, hidden_size=37) - - @unittest.skip("Failing because of unique cache (HybridCache)") - def test_model_outputs_equivalence(self, **kwargs): - pass - - @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different") - def test_eager_matches_sdpa_inference(self): - pass - - @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different") - def test_sdpa_equivalence(self): - pass - - def test_eager_attention_loaded_by_default(self): - """Gemma 2 + SDPA = inferior results, because of the logit softcapping. 
Eager is the default.""" - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - model = Gemma2Model(config) - self.assertTrue(model.config._attn_implementation == "eager") - - -@slow -@require_mindspore -class Gemma2IntegrationTest(unittest.TestCase): - input_text = ["Hello I am doing", "Hi today"] - # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) - # Depending on the hardware we get different logits / generations - cuda_compute_capability_major_version = None - - - def test_model_9b_bf16(self): - model_id = "google/gemma-2-9b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", - "Hi today I'm going to be talking about the history of the United States. The United States of America", - ] - - model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=mindspore.bfloat16, attn_implementation="eager" - ) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=False) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_9b_fp16(self): - model_id = "google/gemma-2-9b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", - "Hi today I'm going to be talking about the history of the United States. The United States of America", - ] - - model = AutoModelForCausalLM.from_pretrained( - model_id, low_cpu_mem_usage=True, torch_dtype=mindspore.float16, attn_implementation="eager" - ) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="ms", padding=True) - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=False) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - def test_model_9b_pipeline_bf16(self): - # See https://github.com/huggingface/transformers/pull/31747 -- pipeline was broken for Gemma2 before this PR - model_id = "google/gemma-2-9b" - # EXPECTED_TEXTS should match the same non-pipeline test, minus the special tokens - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many", - "Hi today I'm going to be talking about the history of the United States. The United States of America", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=mindspore.bfloat16) - tokenizer = AutoTokenizer.from_pretrained(model_id) - pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) - - output = pipe(self.input_text, max_new_tokens=20, do_sample=False, padding=True) - - self.assertEqual(output[0][0]["generated_text"], EXPECTED_TEXTS[0]) - self.assertEqual(output[1][0]["generated_text"], EXPECTED_TEXTS[1]) diff --git a/tests/transformers/models/git/__init__.py b/tests/transformers/models/git/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/git/test_modeling_git.py b/tests/transformers/models/git/test_modeling_git.py deleted file mode 100644 index 224c81c79..000000000 --- a/tests/transformers/models/git/test_modeling_git.py +++ /dev/null @@ -1,600 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import unittest -import numpy as np - -from huggingface_hub import hf_hub_download -from mindspore import ops - -from mindnlp.transformers import GitConfig, GitProcessor, GitVisionConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow, is_mindspore_available, is_vision_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - - from mindnlp.transformers import MODEL_FOR_CAUSAL_LM_MAPPING, GitForCausalLM, GitModel, GitVisionModel - -if is_vision_available(): - from PIL import Image - - -class GitVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=32, - patch_size=16, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return GitVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = GitVisionModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // 
patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class GitVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as GIT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (GitVisionModel,) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = GitVisionModelTester(self) - self.config_tester = ConfigTester(self, config_class=GitVisionConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="GIT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="GitVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="GitVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/git-base" - model = GitVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class GitModelTester: - def __init__( - self, - parent, - num_channels=3, - image_size=32, - patch_size=16, - batch_size=13, - text_seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - 
attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - num_labels=3, - scope=None, - ): - self.parent = parent - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.batch_size = batch_size - self.text_seq_length = text_seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - - # make sure the BOS, EOS and PAD tokens are within the vocab - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - # for GIT, the sequence length is the sum of the text and patch tokens, + 1 due to the CLS token - self.seq_length = self.text_seq_length + int((self.image_size / self.patch_size) ** 2) + 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.text_seq_length]) - - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - config = self.get_config() - - return config, input_ids, input_mask, pixel_values - - def get_config(self): - """ - Returns a tiny configuration by default. 
- """ - return GitConfig( - vision_config={ - "num_channels": self.num_channels, - "image_size": self.image_size, - "patch_size": self.patch_size, - "hidden_size": self.hidden_size, - "projection_dim": 32, - "num_hidden_layers": self.num_hidden_layers, - "num_attention_heads": self.num_attention_heads, - }, - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model(self, config, input_ids, input_mask, pixel_values): - model = GitModel(config=config) - model.set_train(False) - - # inference with pixel values - result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # inference without pixel values - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size) - ) - - def create_and_check_for_causal_lm(self, config, input_ids, input_mask, pixel_values): - model = GitForCausalLM(config=config) - model.set_train(False) - - # inference with pixel values - result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # inference without pixel values - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.vocab_size)) - - # training - result = model(input_ids, attention_mask=input_mask, pixel_values=pixel_values, labels=input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertTrue(result.loss.item() > 0) - - def _test_beam_search_generate(self, config, input_ids, input_mask, pixel_values): - model = GitForCausalLM(config=config) - model.set_train(False) - - # generate - generated_ids = model.generate( - input_ids, - attention_mask=input_mask, - pixel_values=pixel_values, - do_sample=False, - max_length=20, - num_beams=2, - num_return_sequences=2, - ) - - self.parent.assertEqual(generated_ids.shape, (self.batch_size * 2, 20)) - - def _test_batched_generate_captioning(self, config, input_ids, input_mask, pixel_values): - model = GitForCausalLM(config=config) - model.set_train(False) - - # generate - generated_ids = model.generate( - input_ids=None, # captioning -> no input_ids - attention_mask=None, - pixel_values=pixel_values, - do_sample=False, - max_length=20, - num_beams=2, - num_return_sequences=2, - ) - - self.parent.assertEqual(generated_ids.shape, (self.batch_size * 2, 20)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - pixel_values, - ) = 
config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": input_mask, - "pixel_values": pixel_values, - } - - return config, inputs_dict - - -@require_mindspore -class GitModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (GitModel, GitForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (GitForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM} - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_torchscript = False - - # special case for GitForCausalLM model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_CAUSAL_LM_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.text_seq_length), - dtype=mindspore.int64, - ) - return inputs_dict - - def setUp(self): - self.model_tester = GitModelTester(self) - self.config_tester = ConfigTester(self, config_class=GitConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_beam_search_generate(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester._test_beam_search_generate(*config_and_inputs) - - def test_batched_generate_captioning(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester._test_batched_generate_captioning(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - # GIT attention shape depends on image inputs, overwrite - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - image_length = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) - - for idx, iter_attentions in enumerate(attentions): - tgt_len = min_length + idx + image_length if not use_cache else 1 - src_len = min_length + idx + image_length - - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - src_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) - ) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - # GIT attention shape depends on image inputs, overwrite - 
self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - image_length = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) - - for idx, iter_hidden_states in enumerate(hidden_states): - seq_len = min_length + idx + image_length if not use_cache else 1 - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - # check hidden size - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], - [expected_shape] * len(iter_hidden_states), - ) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/git-base" - model = GitModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip(reason="GIT has pixel values as additional input") - def test_beam_search_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="GIT has pixel values as additional input") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="GIT has pixel values as additional input") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="GIT has pixel values as additional input") - def test_contrastive_generate_low_memory(self): - pass - - @unittest.skip(reason="GIT has pixel values as additional input") - def test_greedy_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="GIT has pixel values as additional input") - def test_dola_decoding_sample(self): - pass - - -@require_mindspore -@require_vision -@slow -class GitModelIntegrationTest(unittest.TestCase): - def test_forward_pass(self): - processor = GitProcessor.from_pretrained("microsoft/git-base") - model = GitForCausalLM.from_pretrained("microsoft/git-base") - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor(images=image, text="hello world", return_tensors="ms") - - outputs = model(**inputs) - - expected_shape = (1, 201, 30522) - self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = mindspore.tensor( - [[-0.9514, -0.9512, -0.9507], [-0.5454, -0.5453, -0.5453], [-0.8862, -0.8857, -0.8848]], - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - def test_inference_image_captioning(self): - processor = GitProcessor.from_pretrained("microsoft/git-base") - model = GitForCausalLM.from_pretrained("microsoft/git-base") - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor(images=image, return_tensors="ms") - pixel_values = inputs.pixel_values - - outputs = model.generate( - pixel_values=pixel_values, max_length=20, output_scores=True, return_dict_in_generate=True - ) - generated_caption = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0] - - expected_shape = (1, 9) - self.assertEqual(outputs.sequences.shape, expected_shape) - self.assertEqual(generated_caption, "two cats laying on a pink blanket") - self.assertTrue(outputs.scores[-1].shape, expected_shape) - expected_slice = mindspore.tensor([[-0.8805, -0.8803, -0.8799]]) - self.assertTrue(np.allclose(outputs.scores[-1][0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - def test_visual_question_answering(self): - processor = GitProcessor.from_pretrained("microsoft/git-base-textvqa") - model = 
GitForCausalLM.from_pretrained("microsoft/git-base-textvqa") - - # prepare image - file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") - image = Image.open(file_path).convert("RGB") - inputs = processor(images=image, return_tensors="ms") - pixel_values = inputs.pixel_values - - # prepare question - question = "what does the front of the bus say at the top?" - input_ids = processor(text=question, add_special_tokens=False).input_ids - input_ids = [processor.tokenizer.cls_token_id] + input_ids - input_ids = mindspore.tensor(input_ids).unsqueeze(0) - - generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=20) - generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - expected_shape = (1, 15) - self.assertEqual(generated_ids.shape, expected_shape) - self.assertEqual(generated_caption, "what does the front of the bus say at the top? special") - - def test_batched_generation(self): - processor = GitProcessor.from_pretrained("microsoft/git-base-coco") - model = GitForCausalLM.from_pretrained("microsoft/git-base-coco") - - # create batch of size 2 - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor(images=[image, image], return_tensors="ms") - pixel_values = inputs.pixel_values - - # we have to prepare `input_ids` with the same batch size as `pixel_values` - start_token_id = model.config.bos_token_id - input_ids = mindspore.tensor([[start_token_id], [start_token_id]]) - generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) - generated_captions = processor.batch_decode(generated_ids, skip_special_tokens=True) - - self.assertEqual(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2) diff --git a/tests/transformers/models/git/test_processor_git.py b/tests/transformers/models/git/test_processor_git.py deleted file mode 100644 index 6407e87a8..000000000 --- a/tests/transformers/models/git/test_processor_git.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import shutil -import tempfile -import unittest - -import numpy as np -import pytest - -from mindnlp.utils.testing_utils import require_vision -from mindnlp.utils import is_vision_available - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoProcessor, BertTokenizer, CLIPImageProcessor, GitProcessor, PreTrainedTokenizerFast - - -@require_vision -class GitProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - image_processor = CLIPImageProcessor() - tokenizer = BertTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-BertModel", model_input_names=["input_ids", "attention_mask"] - ) - - processor = GitProcessor(image_processor, tokenizer) - - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - - def test_save_load_pretrained_additional_features(self): - processor = GitProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = GitProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" 
- image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - # For now the processor supports only ['input_ids', 'attention_mask', 'pixel_values'] - self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask", "pixel_values"]) diff --git a/tests/transformers/models/gpt2/__init__.py b/tests/transformers/models/gpt2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gpt2/test_modeling_gpt2.py b/tests/transformers/models/gpt2/test_modeling_gpt2.py deleted file mode 100644 index 041686701..000000000 --- a/tests/transformers/models/gpt2/test_modeling_gpt2.py +++ /dev/null @@ -1,828 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import datetime -import gc -import math -import unittest -import numpy as np - -from mindnlp.transformers import GPT2Config -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, - GPT2DoubleHeadsModel, - GPT2ForQuestionAnswering, - GPT2ForSequenceClassification, - GPT2ForTokenClassification, - GPT2LMHeadModel, - GPT2Model, - GPT2Tokenizer, - ) - - -class GPT2ModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return GPT2Config.from_pretrained("gpt2") - - def prepare_config_and_inputs( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config( - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, 
- reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - return GPT2Config( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - n_inner=self.intermediate_size, - activation_function=self.hidden_act, - resid_pdrop=self.hidden_dropout_prob, - attn_pdrop=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPT2Model(config=config) - - model.set_train(False) - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPT2Model(config=config) - - model.set_train(False) - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, 
token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_gpt2_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPT2Model(config=config) - - model.set_train(False) - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_gpt2_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPT2Model(config=config) - - model.set_train(False) - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), 
output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPT2LMHeadModel(config) - - model.set_train(False) - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPT2LMHeadModel(config) - - def forward(input_ids, token_type_ids): - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - return result.loss - - grad_fn = mindspore.value_and_grad(forward, None, tuple(model.parameters())) - grad_fn(input_ids, token_type_ids) - - def create_and_check_double_lm_head_model( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args - ): - model = GPT2DoubleHeadsModel(config) - - model.set_train(False) - - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - - inputs = { - "input_ids": multiple_choice_inputs_ids, - "mc_token_ids": mc_token_ids, - "attention_mask": multiple_choice_input_mask, - "token_type_ids": multiple_choice_token_type_ids, - "labels": multiple_choice_inputs_ids, - } - - result = model(**inputs) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size) - ) - self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_gpt2_for_question_answering( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPT2ForQuestionAnswering(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_gpt2_for_sequence_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPT2ForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_gpt2_for_token_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = 
GPT2ForTokenClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_gpt2_weight_initialization(self, config, *args): - model = GPT2Model(config) - model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) - for key in model.parameters_dict().keys(): - if "c_proj" in key and "weight" in key: - self.parent.assertLessEqual(abs(ops.std(model.parameters_dict()[key]) - model_std), 0.001) - self.parent.assertLessEqual(abs(ops.mean(model.parameters_dict()[key]) - 0.0), 0.01) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - -@require_mindspore -class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - GPT2LMHeadModel, - GPT2DoubleHeadsModel, - GPT2ForQuestionAnswering, - GPT2ForSequenceClassification, - GPT2ForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPT2Model, - "question-answering": GPT2ForQuestionAnswering, - "text-classification": GPT2ForSequenceClassification, - "text-generation": GPT2LMHeadModel, - "token-classification": GPT2ForTokenClassification, - "zero-shot": GPT2ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - all_parallelizable_model_classes = (GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_mindspore_available() else () - fx_compatible = True - test_missing_keys = False - test_model_parallel = True - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "GPT2DoubleHeadsModel": - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), - dtype=mindspore.int64 - ) - inputs_dict["input_ids"] = inputs_dict["labels"] - inputs_dict["token_type_ids"] = inputs_dict["labels"] - inputs_dict["mc_token_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_choices), - dtype=mindspore.int64 - ) - inputs_dict["mc_labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = GPT2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37) - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - def test_config(self): - self.config_tester.run_common_tests() - - def test_gpt2_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model(*config_and_inputs) - - def test_gpt2_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_gpt2_model_past(*config_and_inputs) - - def test_gpt2_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model_attention_mask_past(*config_and_inputs) - - def test_gpt2_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_model_past_large_inputs(*config_and_inputs) - - def test_gpt2_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_gpt2_double_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) - - def test_gpt2_question_answering_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_for_question_answering(*config_and_inputs) - - def test_gpt2_sequence_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_for_sequence_classification(*config_and_inputs) - - def test_gpt2_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_for_token_classification(*config_and_inputs) - - def test_gpt2_scale_attn_by_inverse_layer_idx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True) - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) - - def test_gpt2_reorder_and_upcast_attn(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True) - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) - - def test_gpt2_weight_initialization(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt2_weight_initialization(*config_and_inputs) - - - @slow - def test_batch_generation(self): - model = GPT2LMHeadModel.from_pretrained("gpt2") - - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - token_type_ids = ops.cat( - [ - ops.full((input_ids.shape[0], input_ids.shape[1] - 1), 0, dtype=input_ids.dtype), - ops.full((input_ids.shape[0], 1), 500, dtype=input_ids.dtype), - ], - dim=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - token_type_ids=token_type_ids, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - 
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a mess. I'm not sure if he's going", - "Today, I'm going to be doing a lot of research on this. I", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_batch_generation_2heads(self): - model = GPT2DoubleHeadsModel.from_pretrained("gpt2") - - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - tokenizer.padding_side = "left" - - # This tokenizer has no pad token, so we have to set it in some way - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - token_type_ids = ops.cat( - [ - ops.full((input_ids.shape[0], input_ids.shape[1] - 1), 0, dtype=input_ids.dtype), - ops.full((input_ids.shape[0], 1), 500, dtype=input_ids.dtype), - ], - dim=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - token_type_ids=token_type_ids, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a mess. I'm not sure if he's going", - "Today, I'm going to be doing a lot of research on this. 
I", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - for model_name in GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = GPT2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class GPT2ModelLanguageGenerationTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - def _test_lm_generate_gpt2_helper( - self, - reorder_and_upcast_attn=False, - scale_attn_by_inverse_layer_idx=False, - verify_outputs=True, - ): - model = GPT2LMHeadModel.from_pretrained( - "gpt2", - reorder_and_upcast_attn=reorder_and_upcast_attn, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - ) - - # The dog - input_ids = mindspore.tensor([[464, 3290]], dtype=mindspore.int64) - - # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog - expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,] # fmt: skip - output_ids = model.generate(input_ids, do_sample=False) - if verify_outputs: - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) - - @slow - def test_lm_generate_gpt2(self): - self._test_lm_generate_gpt2_helper() - - @slow - def test_lm_generate_gpt2_with_reorder_and_upcast_attn(self): - self._test_lm_generate_gpt2_helper(reorder_and_upcast_attn=True) - - @slow - def test_lm_generate_gpt2_with_scale_attn_by_inverse_layer_idx(self): - self._test_lm_generate_gpt2_helper(scale_attn_by_inverse_layer_idx=True, verify_outputs=False) - - @slow - def test_gpt2_sample(self): - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - model = GPT2LMHeadModel.from_pretrained("gpt2") - - - mindspore.set_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - output_ids = model.generate(input_ids, do_sample=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - token_type_ids = tokenized.token_type_ids - output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) - output_seq_tt = model.generate( - input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 - ) - output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) - output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) - - EXPECTED_OUTPUT_STR = ( - "Today is a nice day and if you don't know anything about the state of play during your holiday" - ) - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) - self.assertTrue( - all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) - ) # token_type_ids should change output - - @slow - def test_gpt2_sample_max_time(self): - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - model = GPT2LMHeadModel.from_pretrained("gpt2") - - - mindspore.set_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - - MAX_TIME = 0.5 - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() 
- start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=None, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - @slow - def test_contrastive_search_gpt2(self): - article = ( - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " - "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" - ) - - gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large") - gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2-large") - input_ids = gpt2_tokenizer(article, return_tensors="ms").input_ids - - outputs = gpt2_model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) - - generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " - "laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based in London, " - "United Kingdom\n\nGoogle has a lot of data on its users and uses it to improve its products, such as " - "Google Now, which helps users find the information they're looking for on the web. But the company " - "is not the only one to collect data on its users. Facebook, for example, has its own facial " - "recognition technology, as well as a database of millions of photos that it uses to personalize its " - "News Feed.\n\nFacebook's use of data is a hot topic in the tech industry, with privacy advocates " - "concerned about the company's ability to keep users' information private. In a blog post last " - 'year, Facebook CEO Mark Zuckerberg said his company would "do our best to be transparent about our ' - 'data use and how we use it."\n\n"We have made it clear that we do not sell or share your data with ' - 'third parties," Zuckerberg wrote. 
"If you have questions or concerns, please reach out to us at ' - 'privacy@facebook.com."\n\nGoogle declined to comment on the privacy implications of its use of data, ' - "but said in a statement to The Associated Press that" - ], - ) diff --git a/tests/transformers/models/gpt_bigcode/__init__.py b/tests/transformers/models/gpt_bigcode/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/transformers/models/gpt_bigcode/test_modeling_gpt_bigcode.py deleted file mode 100644 index 50dc6013a..000000000 --- a/tests/transformers/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ /dev/null @@ -1,613 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import unittest - -from parameterized import parameterized - -from mindnlp.transformers import GPTBigCodeConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - GPT2TokenizerFast, - GPTBigCodeForCausalLM, - GPTBigCodeForSequenceClassification, - GPTBigCodeForTokenClassification, - GPTBigCodeModel, - ) - from mindnlp.transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeAttention - - -class GPTBigCodeModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - multi_query=True, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size 
= type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 2 - self.pad_token_id = vocab_size - 3 - self.multi_query = multi_query - - def get_large_model_config(self): - return GPTBigCodeConfig.from_pretrained("bigcode/gpt_bigcode-santacoder") - - def prepare_config_and_inputs( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config( - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - return GPTBigCodeConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - n_inner=self.intermediate_size, - activation_function=self.hidden_act, - resid_pdrop=self.hidden_dropout_prob, - attn_pdrop=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - attention_softmax_in_fp32=False, - scale_attention_softmax_in_fp32=False, - multi_query=self.multi_query, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_gpt_bigcode_model(self, config, input_ids, 
input_mask, head_mask, token_type_ids, *args): - model = GPTBigCodeModel(config=config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_gpt_bigcode_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTBigCodeModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gpt_bigcode_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTBigCodeModel(config=config) - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = 
output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gpt_bigcode_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTBigCodeModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTBigCodeForCausalLM(config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = GPTBigCodeForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def create_and_check_gpt_bigcode_for_sequence_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPTBigCodeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_gpt_bigcode_for_token_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = 
GPTBigCodeForTokenClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_gpt_bigcode_weight_initialization(self, config, *args): - model = GPTBigCodeModel(config) - model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer) - for key in model.state_dict().keys(): - if "c_proj" in key and "weight" in key: - self.parent.assertLessEqual(abs(ops.std(model.state_dict()[key]) - model_std), 0.001) - self.parent.assertLessEqual(abs(ops.mean(model.state_dict()[key]) - 0.0), 0.01) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - -@require_mindspore -class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - # TODO: Update the tests to use valid pretrained models. - all_model_classes = ( - ( - GPTBigCodeModel, - GPTBigCodeForCausalLM, - GPTBigCodeForSequenceClassification, - GPTBigCodeForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (GPTBigCodeForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPTBigCodeModel, - "text-classification": GPTBigCodeForSequenceClassification, - "text-generation": GPTBigCodeForCausalLM, - "token-classification": GPTBigCodeForTokenClassification, - "zero-shot": GPTBigCodeForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_torchscript = False - multi_query = True - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - return inputs_dict - - def setUp(self): - self.model_tester = GPTBigCodeModelTester(self, multi_query=self.multi_query) - self.config_tester = ConfigTester(self, config_class=GPTBigCodeConfig, n_embd=37) - - def tearDown(self): - import gc - - gc.collect() - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="MQA models does not support retain_grad") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Disk offload seems to be broken for some reason - tiny models keep hitting corner cases") - def test_disk_offload(self): - pass - - @unittest.skip(reason="BigCodeGPT has a non-standard KV cache format.") - def test_past_key_values_format(self): - pass - - def test_gpt_bigcode_model(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_model(*config_and_inputs) - - def test_gpt_bigcode_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_model_past(*config_and_inputs) - - def test_gpt_bigcode_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_model_attention_mask_past(*config_and_inputs) - - def test_gpt_bigcode_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_model_past_large_inputs(*config_and_inputs) - - def test_gpt_bigcode_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_gpt_bigcode_sequence_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_for_sequence_classification(*config_and_inputs) - - def test_gpt_bigcode_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_for_token_classification(*config_and_inputs) - - # def test_gpt_bigcode_gradient_checkpointing(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - def test_gpt_bigcode_scale_attn_by_inverse_layer_idx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True) - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) - - def test_gpt_bigcode_reorder_and_upcast_attn(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True) - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs) - - def test_gpt_bigcode_weight_initialization(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_bigcode_weight_initialization(*config_and_inputs) - - -@require_mindspore -class GPTBigCodeMHAModelTest(GPTBigCodeModelTest): - # `parameterized_class` breaks with mixins, so we use inheritance instead - multi_query = False - - -@slow -@require_mindspore -class GPTBigCodeModelLanguageGenerationTest(unittest.TestCase): - def test_generate_simple(self): - model = GPTBigCodeForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder") - tokenizer = GPT2TokenizerFast.from_pretrained("bigcode/gpt_bigcode-santacoder") - - input_ids = tokenizer("def print_hello_world():", return_tensors="ms").input_ids - - output_sequence = model.generate(input_ids) - output_sentence = tokenizer.decode(output_sequence[0], skip_special_tokens=True) - - expected_output = """def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_""" - self.assertEqual(output_sentence, expected_output) - - def test_generate_batched(self): - tokenizer = GPT2TokenizerFast.from_pretrained("bigcode/gpt_bigcode-santacoder") - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "left" - - model = GPTBigCodeForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder") - - inputs = tokenizer(["def print_hello_world():", "def say_hello():"], return_tensors="ms", padding=True) - outputs = 
model.generate(**inputs) - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - expected_output = [ - 'def print_hello_world():\n print("Hello World!")\n\n\ndef print_hello_', - 'def say_hello():\n print("Hello, World!")\n\n\nsay_hello()', - ] - self.assertListEqual(outputs, expected_output) - - -@require_mindspore -class GPTBigCodeMQATest(unittest.TestCase): - def get_attention(self, multi_query): - config = GPTBigCodeConfig.from_pretrained( - "bigcode/gpt_bigcode-santacoder", - multi_query=multi_query, - attn_pdrop=0, - resid_pdrop=0, - ) - return GPTBigCodeAttention(config) - - @parameterized.expand([(seed, is_train_mode) for seed in range(5) for is_train_mode in [True, False]]) - def test_mqa_reduces_to_mha(self, seed, is_train_mode=True): - mindspore.manual_seed(seed) - mindspore.set_seed(seed) - - # CREATE MQA AND MHA ATTENTIONS - attention_mqa = self.get_attention(True) - attention_mha = self.get_attention(False) - - # ENFORCE MATCHING WEIGHTS - num_heads = attention_mqa.num_heads - embed_dim = attention_mqa.embed_dim - head_dim = attention_mqa.head_dim - - with no_grad(): - mqa_q_weight = attention_mqa.c_attn.weight[:embed_dim, :].view(num_heads, 1, head_dim, embed_dim) - mqa_kv_weight = attention_mqa.c_attn.weight[embed_dim:, :].view(1, 2, head_dim, embed_dim) - mha_c_weight = ops.cat( - [mqa_q_weight, mqa_kv_weight.broadcast_to((num_heads, 2, head_dim, embed_dim))], dim=1 - ).view(3 * num_heads * head_dim, embed_dim) - - mqa_q_bias = attention_mqa.c_attn.bias[:embed_dim].view(num_heads, 1, head_dim) - mqa_kv_bias = attention_mqa.c_attn.bias[embed_dim:].view(1, 2, head_dim) - mha_c_bias = ops.cat([mqa_q_bias, mqa_kv_bias.broadcast_to((num_heads, 2, head_dim))], dim=1).view( - 3 * num_heads * head_dim - ) - - attention_mha.c_attn.weight.assign_value(mha_c_weight) - attention_mha.c_attn.bias.assign_value(mha_c_bias) - attention_mha.c_proj.weight.assign_value(attention_mqa.c_proj.weight) - attention_mha.c_proj.bias.assign_value(attention_mqa.c_proj.bias) - - # PUT THE MODEL INTO THE CORRECT MODE - attention_mha.train(is_train_mode) - attention_mqa.train(is_train_mode) - - # RUN AN INPUT THROUGH THE MODELS - num_tokens = 5 - hidden_states = ops.randn(1, num_tokens, embed_dim) - attention_mha_result = attention_mha(hidden_states)[0] - attention_mqa_result = attention_mqa(hidden_states)[0] - - # CHECK THAT ALL OUTPUTS ARE THE SAME - self.assertTrue(ops.allclose(attention_mha_result, attention_mqa_result, atol=1e-5)) \ No newline at end of file diff --git a/tests/transformers/models/gpt_neo/__init__.py b/tests/transformers/models/gpt_neo/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gpt_neo/test_modeling_gpt_neo.py b/tests/transformers/models/gpt_neo/test_modeling_gpt_neo.py deleted file mode 100644 index b472c3a8a..000000000 --- a/tests/transformers/models/gpt_neo/test_modeling_gpt_neo.py +++ /dev/null @@ -1,595 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch GPT Neo model.""" - -import unittest - -from mindnlp.transformers import GPTNeoConfig -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import cached_property, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - GPT2Tokenizer, - GPTNeoForCausalLM, - GPTNeoForQuestionAnswering, - GPTNeoForSequenceClassification, - GPTNeoForTokenClassification, - GPTNeoModel, - ) - - -class GPTNeoModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - attention_types=[[["global", "local"], 1]], - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - window_size=7, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.window_size = window_size - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - self.attention_types = attention_types - - def get_large_model_config(self): - return GPTNeoConfig.from_pretrained("gpt-neo-125M") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = 
self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return GPTNeoConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - max_position_embeddings=self.max_position_embeddings, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - window_size=self.window_size, - attention_types=self.attention_types, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_gpt_neo_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTNeoModel(config=config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - # past_key_values is not implemented - # self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_gpt_neo_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTNeoModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, 
output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gpt_neo_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTNeoModel(config=config) - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gpt_neo_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTNeoModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTNeoForCausalLM(config) - 
model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_gpt_neo_for_question_answering( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPTNeoForQuestionAnswering(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_gpt_neo_for_sequence_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPTNeoForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_gpt_neo_for_token_classification( - self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = GPTNeoForTokenClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = GPTNeoForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - -@require_mindspore -class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - GPTNeoModel, - GPTNeoForCausalLM, - GPTNeoForQuestionAnswering, - GPTNeoForSequenceClassification, - GPTNeoForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (GPTNeoForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPTNeoModel, - "question-answering": GPTNeoForQuestionAnswering, - "text-classification": GPTNeoForSequenceClassification, - "text-generation": GPTNeoForCausalLM, - "token-classification": GPTNeoForTokenClassification, - "zero-shot": GPTNeoForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - test_missing_keys = False - test_pruning = False - test_model_parallel = False - - # special case for DoubleHeads model - def 
_prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - return inputs_dict - - def setUp(self): - self.model_tester = GPTNeoModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPTNeoConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_gpt_neo_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_model(*config_and_inputs) - - def test_gpt_neo_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_model_past(*config_and_inputs) - - def test_gpt_neo_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_model_attention_mask_past(*config_and_inputs) - - def test_gpt_neo_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_model_past_large_inputs(*config_and_inputs) - - def test_gpt_neo_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_gpt_neo_question_answering_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_for_question_answering(*config_and_inputs) - - def test_gpt_neo_sequence_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_for_sequence_classification(*config_and_inputs) - - def test_gpt_neo_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gpt_neo_for_token_classification(*config_and_inputs) - - # def test_gpt_neo_gradient_checkpointing(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - def _get_hidden_states(self): - return mindspore.tensor( - [ - [ - [0.4983, -0.7584, -1.6944, 0.5440], - [2.6918, 0.4206, 0.4176, 0.2055], - [-0.0071, -0.0405, -1.4920, -0.3630], - [1.0492, 0.1599, -1.7648, 0.2419], - [-1.8348, 2.0514, -0.1946, 0.3203], - [0.7672, -1.1600, -1.7118, -0.9056], - [0.2986, 0.5372, 0.7729, -0.1927], - [0.0285, 0.2629, -1.1156, -1.1992], - ] - ], - dtype=mindspore.float32, - ) - - def test_local_attn_probs(self): - model = GPTNeoModel.from_pretrained("valhalla/gpt-neo-random-tiny").eval() - layer = model.h[1].attn.attention - hidden_states = self._get_hidden_states() - hidden_states = ops.cat([hidden_states, hidden_states - 0.5], dim=2) - - batch_size, seq_length, _ = hidden_states.shape - mask_tokens = 2 - attention_mask = ops.ones(batch_size, seq_length, dtype=mindspore.int64) - attention_mask[:, -mask_tokens:] = 0 # dont attend last mask_tokens - - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = attention_mask[:, None, None, :] - attention_mask = (1.0 - attention_mask) * -10000.0 - - attn_probs = layer(hidden_states, attention_mask=attention_mask, output_attentions=True)[-1] - - # the last 2 tokens are masked, and should have 0 attn_probs - self.assertTrue(ops.all(attn_probs[:, :, -mask_tokens:, -mask_tokens:] == 0)) - - # in loacal attention 
each token can only attend to the previous window_size tokens (inlcuding itself) - # here window_size is 4, so a token at index 5 can only attend to indcies [2, 3, 4, 5] - # and the attn_probs should be 0 for token [0, 1] - self.assertTrue(ops.all(attn_probs[:, :, 5, 2:6] != 0)) - self.assertTrue(ops.all(attn_probs[:, :, 5, :2] == 0)) - - -@require_mindspore -class GPTNeoModelLanguageGenerationTest(unittest.TestCase): - @cached_property - def model(self): - return GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B") - - @cached_property - def tokenizer(self): - return GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B") - - @slow - def test_lm_generate_gpt_neo(self): - for checkpointing in [True, False]: - model = self.model - if checkpointing: - model.gradient_checkpointing_enable() - else: - model.gradient_checkpointing_disable() - input_ids = mindspore.tensor([[464, 3290]], dtype=mindspore.int64) # The dog - # The dog-eared copy of the book, which is a collection of essays by the late author, - expected_output_ids = [464, 3290, 12, 3380, 4866, 286, 262, 1492, 11, 543, 318, 257, 4947, 286, 27126, 416, 262, 2739, 1772, 11] # fmt: skip - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) - - @slow - def test_gpt_neo_sample(self): - model = self.model - tokenizer = self.tokenizer - - mindspore.manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - output_ids = model.generate(input_ids, do_sample=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - EXPECTED_OUTPUT_STR = "Today is a nice day and if you don’t get the memo here is what you can" - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) - - @slow - def test_batch_generation(self): - model = self.model - tokenizer = self.tokenizer - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I am", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a kitty. 
She is a very sweet and loving", - "Today, I am going to talk about the best way to get a job in the", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - model_name = "EleutherAI/gpt-neo-1.3B" - model = GPTNeoModel.from_pretrained(model_name) - self.assertIsNotNone(model) \ No newline at end of file diff --git a/tests/transformers/models/gpt_neox/__init__.py b/tests/transformers/models/gpt_neox/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gpt_neox/test_modeling_gpt_neox.py b/tests/transformers/models/gpt_neox/test_modeling_gpt_neox.py deleted file mode 100644 index 5ee886ca5..000000000 --- a/tests/transformers/models/gpt_neox/test_modeling_gpt_neox.py +++ /dev/null @@ -1,359 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. All rights reserved. -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from parameterized import parameterized -import numpy as np -import mindspore -import unittest -from mindnlp.core import ops -from mindnlp.transformers.models.gpt_neox import ( - GPTNeoXModel, - GPTNeoXConfig, - GPTNeoXForCausalLM, - GPTNeoXForQuestionAnswering, - GPTNeoXForSequenceClassification, - GPTNeoXForTokenClassification, -) - -from mindnlp.transformers import AutoTokenizer -from mindnlp.utils import is_mindspore_available -from mindnlp.engine.utils import set_seed -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -class GPTNeoXModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - 
self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.pad_token_id = vocab_size - 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_labels = None - if self.use_labels: - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return config, input_ids, input_mask, token_labels - - def get_config(self): - return GPTNeoXConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_decoder(self): - config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs() - - config.is_decoder = True - - return config, input_ids, input_mask, token_labels - - def create_and_check_model(self, config, input_ids, input_mask): - model = GPTNeoXModel(config=config) - model.set_train(False) - _ = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder(self, config, input_ids, input_mask): - config.add_cross_attention = True - model = GPTNeoXModel(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm(self, config, input_ids, input_mask, token_labels): - model = GPTNeoXForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering(self, config, input_ids, input_mask, token_labels): - config.num_labels = self.num_labels - model = GPTNeoXForQuestionAnswering(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification(self, config, input_ids, input_mask, token_labels): - config.num_labels = self.num_labels - model = GPTNeoXForSequenceClassification(config) - model.set_train(False) - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification(self, config, input_ids, input_mask, token_labels): - config.num_labels = 
self.num_labels - model = GPTNeoXForTokenClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_decoder_model_past_large_inputs(self, config, input_ids, input_mask): - config.is_decoder = True - model = GPTNeoXForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model(input_ids, attention_mask=input_mask, use_cache=True) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask, output_hidden_states=True) - output_from_no_past = output_from_no_past["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask, token_labels = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - GPTNeoXForCausalLM, - GPTNeoXForQuestionAnswering, - GPTNeoXForSequenceClassification, - GPTNeoXForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (GPTNeoXForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPTNeoXModel, - "question-answering": GPTNeoXForQuestionAnswering, - "text-classification": GPTNeoXForSequenceClassification, - "text-generation": GPTNeoXForCausalLM, - "token-classification": GPTNeoXForTokenClassification, - "zero-shot": GPTNeoXForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_missing_keys = False - test_model_parallel = False - test_head_masking = False - - def setUp(self): - self.model_tester = GPTNeoXModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPTNeoXConfig, hidden_size=64, num_attention_heads=8) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(config, input_ids, input_mask) - - def test_model_as_decoder(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() - 
self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder(config, input_ids, input_mask) - - def test_decoder_model_past_large_inputs(self): - config, input_ids, input_mask, token_labels = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(config, input_ids, input_mask) - - def test_model_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_model_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_model_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_model_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_feed_forward_chunking(self): - pass - - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = GPTNeoXModel(config) - original_model.set_train(False) - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = GPTNeoXModel(config) - scaled_model.set_train(False) - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
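# A minimal sketch of the reasoning behind the two branches below, assuming the
# usual rope_scaling={"type": ..., "factor": ...} convention; max_pos, factor,
# dim and base are illustrative values, not taken from this test.
import numpy as np

max_pos, factor, dim, base = 512, 10.0, 64, 10000.0
positions = np.arange(10)  # a "short" input, well under max_pos

# "linear": every position index is divided by the factor, so even a short input
# sees different rotary angles than the unscaled model (hence assertFalse below).
linear_positions = positions / factor

# "dynamic" (NTK-style): the rotary base is rescaled only once the sequence grows
# past max_pos, so a short input keeps the original embeddings (hence assertTrue).
seq_len = positions.shape[0]
if seq_len > max_pos:
    base = base * ((factor * seq_len / max_pos) - (factor - 1)) ** (dim / (dim - 2))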
- if scaling_type == "dynamic": - self.assertTrue(np.allclose(original_short_output.asnumpy(), scaled_short_output.asnumpy(), atol=1e-5)) - else: - self.assertFalse(np.allclose(original_short_output.asnumpy(), scaled_short_output.asnumpy(), atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(np.allclose(original_long_output.asnumpy(), scaled_long_output.asnumpy(), atol=1e-5)) - - -@require_mindspore -class GPTNeoXLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_gptneox(self): - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped") - for checkpointing in [True, False]: - model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m-deduped") - - if checkpointing: - model.gradient_checkpointing_enable() - # else: - # model.gradient_checkpointing_disable() - - inputs = tokenizer("My favorite food is", return_tensors="ms") - # The hub repo. is updated on 2023-04-04, resulting in poor outputs. - # See: https://github.com/huggingface/transformers/pull/24193 - expected_output = "My favorite food is a good old-fashioned, old-fashioned, old-fashioned.\n\nI'm not sure" - - output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=20) - output_str = tokenizer.batch_decode(output_ids)[0] - - self.assertEqual(output_str, expected_output) - - @slow - def test_pythia_integration(self): - model_name_or_path = "EleutherAI/pythia-70m" - model = GPTNeoXForCausalLM.from_pretrained(model_name_or_path, ms_dtype=mindspore.float16) - EXPECTED_LOGITS = mindspore.tensor([1069.0000, 228.7500, 1072.0000, 1072.0000, 1069.0000, 1068.0000, 1068.0000, 1071.0000, 1071.0000, 1071.0000, 1073.0000, 1070.0000, 1071.0000, 1075.0000, 1073.0000, 1075.0000, 1074.0000, 1069.0000, 1072.0000, 1071.0000, 1071.0000, 1071.0000, 1070.0000, 1069.0000, 1069.0000, 1069.0000, 1070.0000, 1075.0000, 1073.0000, 1074.0000]) # fmt: skip - input_ids = [29, 93, 303, 64, 5478, 49651, 10394, 187, 34, 12939, 875] - # alternative: tokenizer('<|im_start|>system\nA chat between') - def as_tensor(value, dtype=None): - if isinstance(value, list) and isinstance(value[0], np.ndarray): - return mindspore.tensor(np.array(value), dtype) - if isinstance(value, np.ndarray) and value.shape == (0,): - return mindspore.tensor(mindspore._c_expression.Tensor(value, dtype)) # pylint: disable=c-extension-no-member - return mindspore.tensor(value, dtype) - input_ids = as_tensor(input_ids)[None] - outputs = model(input_ids)["logits"][:, -1][0, :30] - print(EXPECTED_LOGITS.asnumpy(), outputs.asnumpy()) - self.assertTrue(np.allclose(EXPECTED_LOGITS.asnumpy(), outputs.asnumpy(), atol=1e-5)) diff --git a/tests/transformers/models/gpt_neox_japanese/__init__.py b/tests/transformers/models/gpt_neox_japanese/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/transformers/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py deleted file mode 100644 index d95cf58d9..000000000 --- a/tests/transformers/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py +++ /dev/null @@ -1,314 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore GPTNeoXJapanese model.""" - -import unittest - -import numpy as np - -from mindnlp.transformers import GPTNeoXJapaneseConfig -from mindnlp.transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import ( - GPTNeoXJapaneseTokenizer, -) -from mindspore import ops -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseModel - - -class GPTNeoXJapaneseModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_multiple_size=4, - hidden_act="gelu", - hidden_dropout=0.0, - attention_dropout=0.1, - weight_tying=True, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_multiple_size = intermediate_multiple_size - self.hidden_act = hidden_act - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.weight_tying = weight_tying - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_labels = None - if self.use_labels: - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - - config = self.get_config() - - return config, input_ids, input_mask, token_labels - - def get_config(self): - return GPTNeoXJapaneseConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_multiple_size=self.intermediate_multiple_size, - hidden_act=self.hidden_act, - hidden_dropout=self.hidden_dropout, - attention_dropout=self.attention_dropout, - weight_tying=self.weight_tying, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - 
is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - config, input_ids, input_mask, token_labels = self.prepare_config_and_inputs() - - config.is_decoder = True - - return config, input_ids, input_mask, token_labels - - def create_and_check_model(self, config, input_ids, input_mask): - model = GPTNeoXJapaneseModel(config=config) - model.set_train(False) - _ = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_model_as_decoder(self, config, input_ids, input_mask): - config.add_cross_attention = True - model = GPTNeoXJapaneseModel(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_for_causal_lm( - self, config, input_ids, input_mask, token_labels - ): - model = GPTNeoXJapaneseForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_decoder_model_past_large_inputs( - self, config, input_ids, input_mask - ): - config.is_decoder = True - model = GPTNeoXJapaneseForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model(input_ids, attention_mask=input_mask, use_cache=True) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat((input_ids, next_tokens), axis=-1) - next_attention_mask = ops.cat((input_mask, next_mask), axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - output_hidden_states=True, - ) - output_from_no_past = output_from_no_past["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue( - np.allclose( - output_from_past_slice.asnumpy(), - output_from_no_past_slice.asnumpy(), - atol=1e-3, - ) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask, token_labels = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class GPTNeoXModelJapaneseTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (GPTNeoXJapaneseModel, GPTNeoXJapaneseForCausalLM) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (GPTNeoXJapaneseForCausalLM,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - 
"feature-extraction": GPTNeoXJapaneseModel, - "text-generation": GPTNeoXJapaneseForCausalLM, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_missing_keys = False - test_model_parallel = False - test_head_masking = False - - def setUp(self): - self.model_tester = GPTNeoXJapaneseModelTester(self) - self.config_tester = ConfigTester( - self, config_class=GPTNeoXJapaneseConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config, input_ids, input_mask, token_labels = ( - self.model_tester.prepare_config_and_inputs() - ) - self.model_tester.create_and_check_model(config, input_ids, input_mask) - - def test_model_as_decoder(self): - config, input_ids, input_mask, token_labels = ( - self.model_tester.prepare_config_and_inputs_for_decoder() - ) - self.model_tester.create_and_check_model_as_decoder( - config, input_ids, input_mask - ) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - config, input_ids, input_mask, token_labels = ( - self.model_tester.prepare_config_and_inputs_for_decoder() - ) - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, input_ids, input_mask - ) - - def test_decoder_model_past_large_inputs(self): - config, input_ids, input_mask, token_labels = ( - self.model_tester.prepare_config_and_inputs() - ) - self.model_tester.create_and_check_decoder_model_past_large_inputs( - config, input_ids, input_mask - ) - - def test_model_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - @slow - def test_generation(self): - model_id = "abeja/gpt-neox-japanese-2.7b" - - prompts = ["データサイエンティストとは、", "100年後に必要とされる会社は、", "フルリモートの環境で働くために必要なことは、", "国境の長いトンネルを抜けると", "美味しい日本食といえば、"] # fmt: skip - - EXPECTED_OUTPUTS = [ - "データサイエンティストとは、データを分析し、ビジネスに役立つ知見を導き出す専門家のことです。", - "100年後に必要とされる会社は、「人」が中心の会社です。", - "フルリモートの環境で働くために必要なことは、「自分の時間をコントロールする」ことです。", - "国境の長いトンネルを抜けると、そこは雪国だった。", - "美味しい日本食といえば、やっぱりお寿司ですよね。", - ] - - tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained(model_id) - model = GPTNeoXJapaneseForCausalLM.from_pretrained(model_id) - - predicted_outputs = [] - for prompt in prompts: - input_ids = tokenizer(prompt, return_tensors="ms").input_ids - generated_ids = model.generate(input_ids, max_length=50) - generated_string = tokenizer.batch_decode( - generated_ids, skip_special_tokens=True - ) - predicted_outputs += generated_string - self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS) - - @unittest.skip("GPTNeoXJapanese applies bias to attention scores") - def test_custom_4d_attention_mask(self): - pass diff --git a/tests/transformers/models/gptj/__init__.py b/tests/transformers/models/gptj/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gptj/test_modeling_gptj.py b/tests/transformers/models/gptj/test_modeling_gptj.py deleted file mode 100644 index 0ca1ef0fb..000000000 --- a/tests/transformers/models/gptj/test_modeling_gptj.py +++ /dev/null @@ -1,601 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import datetime -import unittest - -from mindnlp.transformers import GPTJConfig, is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, - tooslow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, manual_seed - - from mindnlp.transformers import ( - AutoTokenizer, - GPTJForCausalLM, - GPTJForQuestionAnswering, - GPTJForSequenceClassification, - GPTJModel, - ) - -class GPTJModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - rotary_dim=4, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.rotary_dim = rotary_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = 
ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return GPTJConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - rotary_dim=self.rotary_dim, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def create_and_check_gptj_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTJModel(config=config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_gptj_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTJModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids) - outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - - output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"] - output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gptj_model_attention_mask_past( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTJModel(config=config) - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - 
half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_gptj_model_past_large_inputs( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args - ): - model = GPTJModel(config=config) - model.eval() - - # first forward pass - outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True) - - output, past = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_token_type_ids = ops.cat([token_type_ids, next_token_types], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask - )["last_hidden_state"] - output_from_past = model( - next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past - )["last_hidden_state"] - self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1]) - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = GPTJForCausalLM(config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, 
token_type_ids, *args, gradient_checkpointing=False - ): - model = GPTJForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} - - return config, inputs_dict - - -@require_mindspore -class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (GPTJModel, GPTJForCausalLM, GPTJForSequenceClassification, GPTJForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (GPTJForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": GPTJModel, - "question-answering": GPTJForQuestionAnswering, - "text-classification": GPTJForSequenceClassification, - "text-generation": GPTJForCausalLM, - "zero-shot": GPTJForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - test_pruning = False - test_missing_keys = False - test_model_parallel = False - test_head_masking = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, - pipeline_test_case_name, - config_class, - model_architecture, - tokenizer_name, - image_processor_name, - feature_extractor_name, - processor_name, - ): - if ( - pipeline_test_case_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizer are used. 
- # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - return inputs_dict - - def setUp(self): - self.model_tester = GPTJModelTester(self) - self.config_tester = ConfigTester(self, config_class=GPTJConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_gptj_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model(*config_and_inputs) - - def test_gptj_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model_past(*config_and_inputs) - - def test_gptj_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model_attention_mask_past(*config_and_inputs) - - def test_gptj_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_gptj_model_past_large_inputs(*config_and_inputs) - - def test_gptj_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - # def test_gptj_gradient_checkpointing(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - @tooslow - def test_batch_generation(self): - # Marked as @tooslow due to GPU OOM - model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", ms_dtype=mindspore.float16) - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") - - tokenizer.padding_side = "left" - - # Define PAD Token = EOS Token = 50256 - tokenizer.pad_token = tokenizer.eos_token - model.config.pad_token_id = model.config.eos_token_id - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - token_type_ids = ops.cat( - [ - ops.full((input_ids.shape[0], input_ids.shape[1] - 1), 0, dtype=input_ids.dtype), - ops.full((input_ids.shape[0], 1), 500, dtype=input_ids.dtype), - ], - dim=-1, - ) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - outputs_tt = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - token_type_ids=token_type_ids, - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True) - non_padded_sentence = 
tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little over a year old and has been diagnosed with a heart murmur", - "Today, I’m going to talk about the most important thing in the", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertTrue(batch_out_sentence_tt != batch_out_sentence) # token_type_ids should change output - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - - @slow - def test_model_from_pretrained(self): - model_name = "EleutherAI/gpt-j-6B" - model = GPTJModel.from_pretrained(model_name, revision="float16", ms_dtype=mindspore.float16) - self.assertIsNotNone(model) - - -@require_mindspore -class GPTJModelLanguageGenerationTest(unittest.TestCase): - @tooslow - def test_lm_generate_gptj(self): - # Marked as @tooslow due to GPU OOM - for checkpointing in [True, False]: - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6B", revision="float16", ms_dtype=mindspore.float16 - ) - if checkpointing: - model.gradient_checkpointing_enable() - else: - model.gradient_checkpointing_disable() - input_ids = mindspore.tensor([[464, 3290]], dtype=mindspore.int64) # The dog - # The dog is a man's best friend. It is a loyal companion, and it is a friend - expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545] # fmt: skip - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) - - @tooslow - def test_gptj_sample(self): - # Marked as @tooslow due to GPU OOM (issue #13676) - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", revision="float16") - model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", ms_dtype=mindspore.float16) - - manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - output_ids = model.generate(input_ids, do_sample=True) - output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) - - token_type_ids = tokenized.token_type_ids - output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5) - output_seq_tt = model.generate( - input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5 - ) - output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True) - output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True) - - - EXPECTED_OUTPUT_STR = "Today is a nice day and one of those days that feels a bit more alive. 
I am ready" - - self.assertEqual(output_str, EXPECTED_OUTPUT_STR) - self.assertTrue( - all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) - ) # token_type_ids should change output - - @slow - def test_gptj_sample_max_time(self): - tokenizer = AutoTokenizer.from_pretrained("anton-l/gpt-j-tiny-random") - model = GPTJForCausalLM.from_pretrained("anton-l/gpt-j-tiny-random") - - manual_seed(0) - tokenized = tokenizer("Today is a nice day and", return_tensors="ms", return_token_type_ids=True) - input_ids = tokenized.input_ids - - MAX_TIME = 0.5 - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME)) - self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - start = datetime.datetime.now() - model.generate(input_ids, do_sample=False, max_time=None, max_length=256) - duration = datetime.datetime.now() - start - self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME)) - - @tooslow - def test_contrastive_search_gptj(self): - article = ( - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and " - "research laboratory founded in 2010. DeepMind was acquired by Google in 2014. The company is based" - ) - - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6B", revision="float16", ms_dtype=mindspore.float16 - ) - input_ids = tokenizer(article, return_tensors="ms").input_ids - - outputs = model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) - generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "DeepMind Technologies is a British artificial intelligence subsidiary of Alphabet Inc. and research " - "laboratory founded in 2010. DeepMind was acquired by Google in 2014. 
The company is based in London, " - "United Kingdom with offices in Mountain View, San Francisco, New York City, Paris, Tokyo, Seoul, " - "Beijing, Singapore, Tel Aviv, Dublin, Sydney, and Melbourne.[1]\n\nContents\n\nIn 2010, Google's " - "parent company, Alphabet, announced a $500 million investment in DeepMind, with the aim of creating " - "a company that would apply deep learning to problems in healthcare, energy, transportation, and " - "other areas.[2]\n\nOn April 23, 2014, Google announced that it had acquired DeepMind for $400 " - "million in cash and stock.[3] The acquisition was seen as a way for Google to enter the " - "fast-growing field of artificial intelligence (AI), which it had so far avoided due to concerns " - 'about ethical and social implications.[4] Google co-founder Sergey Brin said that he was "thrilled" ' - 'to have acquired DeepMind, and that it would "help us push the boundaries of AI even further."' - "[5]\n\nDeepMind's founders, Demis Hassabis and Mustafa Suleyman, were joined by a number of Google " - "employees" - ], - ) \ No newline at end of file diff --git a/tests/transformers/models/gptsan_japanese/__init__.py b/tests/transformers/models/gptsan_japanese/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/gptsan_japanese/test_modeling_gptsan_japanese.py b/tests/transformers/models/gptsan_japanese/test_modeling_gptsan_japanese.py deleted file mode 100644 index 0b07ce316..000000000 --- a/tests/transformers/models/gptsan_japanese/test_modeling_gptsan_japanese.py +++ /dev/null @@ -1,509 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest -import numpy as np -import mindspore -from mindspore import Tensor,ops -from mindnlp.transformers.models.gptsan_japanese import ( - GPTSanJapaneseConfig, - GPTSanJapaneseForConditionalGeneration, - GPTSanJapaneseModel, - GPTSanJapaneseTokenizer, -) -from mindnlp.transformers.generation import GenerationConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, tooslow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -class GPTSanJapaneseTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - num_contexts=7, - # For common tests - is_training=True, - hidden_size=32, - ext_size=42, - num_hidden_layers=2, - num_ext_layers=2, - num_attention_heads=4, - num_experts=2, - d_ff=32, - d_ext=80, - d_spout=33, - dropout_rate=0.0, - layer_norm_epsilon=1e-6, - expert_capacity=100, - router_jitter_noise=0.0, - ): - self.vocab_size = vocab_size - self.parent = parent - self.batch_size = batch_size - self.num_contexts = num_contexts - # For common tests - self.seq_length = self.num_contexts - self.is_training = is_training - self.hidden_size = hidden_size - self.num_ext_layers = num_ext_layers - self.ext_size = ext_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_experts = num_experts - self.d_ff = d_ff - self.d_ext = d_ext - self.d_spout = d_spout - self.dropout_rate = dropout_rate - self.layer_norm_epsilon = layer_norm_epsilon - self.expert_capacity = expert_capacity - self.router_jitter_noise = router_jitter_noise - - def get_large_model_config(self): - return GPTSanJapaneseConfig.from_pretrained("Tanrei/GPTSAN-japanese") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).astype(mindspore.int32) - - config = self.get_config() - - return (config, input_ids) - - def prepare_config_and_inputs_for_common(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).astype(mindspore.int32) - - config = self.get_config() - - return (config, {"input_ids": input_ids}) - - def get_config(self): - return GPTSanJapaneseConfig( - vocab_size=self.vocab_size, - num_contexts=self.seq_length, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_ext=self.d_ext, - d_spout=self.d_spout, - num_switch_layers=self.num_hidden_layers - self.num_ext_layers, - num_ext_layers=self.num_ext_layers, - num_heads=self.num_attention_heads, - num_experts=self.num_experts, - expert_capacity=self.expert_capacity, - dropout_rate=self.dropout_rate, - layer_norm_epsilon=self.layer_norm_epsilon, - router_jitter_noise=self.router_jitter_noise, - ) - - def create_and_check_model( - self, - config, - input_ids, - ): - model = GPTSanJapaneseForConditionalGeneration(config=config) - model.eval() - result = model( - input_ids=input_ids, - ) - self.parent.assertIsNotNone(result) - - -@require_mindspore -class GPTSanJapaneseTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (GPTSanJapaneseModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": GPTSanJapaneseForConditionalGeneration, - "feature-extraction": GPTSanJapaneseForConditionalGeneration, - "summarization": GPTSanJapaneseForConditionalGeneration, - "text2text-generation": GPTSanJapaneseForConditionalGeneration, - 
"translation": GPTSanJapaneseForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - is_encoder_decoder = False - test_pruning = False - test_headmasking = False - test_cpu_offload = False - test_disk_offload = False - - test_save_load_fast_init_to_base = False - test_training = False - # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests - model_split_percents = [0.8, 0.9] - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "SummarizationPipelineTests": - # TODO: fix `_reorder_cache` is not implemented for this model - return True - elif pipeline_test_casse_name == "Text2TextGenerationPipelineTests": - # TODO: check this. - return True - - return False - - def setUp(self): - self.model_tester = GPTSanJapaneseTester(self) - self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37) - - def test_config(self): - GPTSanJapaneseConfig() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip( - reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`" - ) - def test_model_parallelism(self): - super().test_model_parallelism() - - @unittest.skip(reason="Gptsan does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Gptsan does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - -@slow -@require_mindspore -class GPTSanJapaneseForConditionalGenerationTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (GPTSanJapaneseForConditionalGeneration,) if is_mindspore_available() else () - fx_compatible = False - is_encoder_decoder = False - test_pruning = False - test_headmasking = False - test_cpu_offload = False - test_disk_offload = False - # The small GPTSAN_JAPANESE model needs higher percentages for CPU/MP tests - model_split_percents = [0.8, 0.9] - - def setUp(self): - self.model_tester = GPTSanJapaneseTester(self) - self.config_tester = ConfigTester(self, config_class=GPTSanJapaneseConfig, d_model=37) - - def test_config(self): - GPTSanJapaneseConfig() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip( - reason="skip for now as the computed `max_memory` by `model_split_percents` in the test method will be changed inside `from_pretrained`" - ) - def test_model_parallelism(self): - super().test_model_parallelism() - - @unittest.skip(reason="Gptsan does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Gptsan does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - def test_logits(self): - model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese") - - for param in model.get_parameters(): - - # 如果参数的类型不是 Float32,转换为 Float32 - if param.dtype != mindspore.float32: - param.set_dtype(mindspore.float32) - model.to(dtype=mindspore.float32) - tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") - input_ids = tokenizer.encode("武田信玄は", return_tensors="ms").astype(mindspore.int32) - outputs 
= model(input_ids) - output_logits = outputs.logits.asnumpy() - # Output of original model created with mesh-tensoflow - target = [ - [-12.037839889526367, -12.433061599731445, -14.333840370178223, -12.450345993041992, -11.1661376953125, - -11.930137634277344, -10.659740447998047, -12.909574508666992, -13.241043090820312, -13.398579597473145, - -11.107524871826172, -12.3685941696167, -22.97943115234375, -10.481067657470703, -12.484030723571777, - -12.807360649108887, -14.769700050354004, -12.233579635620117, -13.428145408630371, -22.624177932739258], - [-7.511149883270264, -8.281851768493652, -7.943127155303955, -7.55021333694458, -6.49869966506958, - -7.586796283721924, -6.978085994720459, -7.839145183563232, -8.21964168548584, -8.695091247558594, - -6.706910610198975, -6.6585798263549805, -19.565698623657227, -5.353842735290527, -8.350686073303223, - -8.039388656616211, -10.856569290161133, -7.75154447555542, -8.819022178649902, -19.51532745361328], - [-9.73066234588623, -10.223922729492188, -9.932981491088867, -11.857836723327637, -7.662626266479492, - -11.13529109954834, -7.765097618103027, -11.472923278808594, -9.543149948120117, -11.905633926391602, - -9.366164207458496, -11.5734281539917, -23.699003219604492, -9.429590225219727, -10.42839241027832, - -10.585240364074707, -10.94771957397461, -11.095416069030762, -10.390240669250488, -23.769372940063477], - [-9.728265762329102, -9.859712600708008, -10.09729290008545, -9.678522109985352, -6.879519939422607, - -9.68487548828125, -4.2803425788879395, -10.018914222717285, -9.308445930480957, -10.63394546508789, - -8.083646774291992, -9.06301498413086, -21.904266357421875, -8.90160846710205, -8.841876029968262, - -11.856719970703125, -12.079398155212402, -11.233753204345703, -10.177338600158691, -21.87256622314453], - [-9.669764518737793, -9.614198684692383, -9.814510345458984, -9.996501922607422, -11.375690460205078, - -10.113405227661133, -10.546867370605469, -10.04369068145752, -10.907809257507324, -10.504216194152832, - -11.129199028015137, -10.151124000549316, -21.96586799621582, -9.086349487304688, -11.730339050292969, - -10.460667610168457, -10.298049926757812, -10.784148216247559, -10.840693473815918, -22.03152847290039], - ] - - target = np.array(target).flatten() - predict = output_logits[0, :, :20].flatten() - - def check(a, b, epsilon=5e-3): - return abs(a - b) < epsilon * max(abs(a), abs(b)) - - self.assertTrue(np.all([check(target[i], predict[i]) for i in range(len(target))])) - - def test_batch_generation(self): - model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese") - - for param in model.get_parameters(): - # 如果参数的类型不是 Float32,转换为 Float32 - if param.dtype != mindspore.float32: - param.set_dtype(mindspore.float32) - model.to(dtype=mindspore.float32) - tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") - - # set deterministically - generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese") - generation_config.top_k = 1 - - # use different length sentences to test batching - sentences = [ - "甲斐なら武田と言うほど", - "織田信長は、", - ] - - tokenizer.padding_side = "left" - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"].astype(mindspore.int64) - - self.assertNotEqual(inputs["attention_mask"][0].asnumpy().tolist(), inputs["attention_mask"][1].asnumpy().tolist()) - - outputs = model.generate( - input_ids=input_ids.astype(mindspore.int64), - attention_mask=inputs["attention_mask"], - max_new_tokens=3, - 
generation_config=generation_config, - ) - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids.astype(mindspore.int64) - output_non_padded = model.generate( - input_ids=inputs_non_padded, max_new_tokens=3, generation_config=generation_config - ) - - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids.astype(mindspore.int64) - output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=3, generation_config=generation_config) - - self.assertNotEqual(inputs_non_padded.shape, inputs_padded.shape) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "甲斐なら武田と言うほど甲斐の武田", - "織田信長は、このような", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence]) - - - @tooslow - def test_sample(self): - model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese") - - for block in model.model.blocks: - block.self_attn.self_attn.q_proj.weight.set_dtype(mindspore.float32) - block.self_attn.self_attn.k_proj.weight.set_dtype(mindspore.float32) - block.self_attn.self_attn.v_proj.weight.set_dtype(mindspore.float32) - block.self_attn.self_attn.out_proj.weight.set_dtype(mindspore.float32) - - model.to(dtype=mindspore.float32) - tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") - # Output of original model created with mesh-tensorflow - target = [ - ("武田信玄は", 35675), - ("武田信玄は、", 45), - ("武田信玄は、この", 29), - ("武田信玄は、このよう", 30642), - ("武田信玄は、このような", 35680), - ("武田信玄は、このような「", 8640), - ("武田信玄は、このような「武田", 31617), - ("武田信玄は、このような「武田家", 30646), - ("武田信玄は、このような「武田家の", 31617), - ("武田信玄は、このような「武田家の家", 31381), - ] - for input, output in target: - input_ids = tokenizer.encode(input, return_tensors="ms") - outputs = model(input_ids) - output_logits = outputs.logits.detach().cpu().numpy()[0] - output_id = np.argmax(output_logits[-1]) - self.assertEqual(output_id, output) - - - def test_spout_generation(self): - # Load model and convert to float32 type - model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese") - tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") - for param in model.get_parameters(): - # If the parameter dtype is not float32, cast it to float32 - if param.dtype != mindspore.float32: - param.set_dtype(mindspore.float32) - - # Set generation config - generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese") - generation_config.top_k = 1 - - input_text = "武田信玄は、" - input_ids = tokenizer(input_text, return_tensors="ms").input_ids.astype(mindspore.int64) # Ensure input_ids are int64 - input_ids_batch = tokenizer([input_text, input_text], return_tensors="ms").input_ids.astype(mindspore.int64) - - # Use uniform distribution and one-hot vector for spout - spouts = [ - Tensor([0.87882208, 0.38426396, 0.33220248, 0.43890406, 0.16562252, - 0.04803985, 0.211572 , 0.23188473, 0.37153068, 0.7836377 , - 0.02160172, 0.38761719, 0.75290772, 0.90198857, 0.34365777, - 0.64168169, 0.44318471, 0.14575746, 0.92562881, 0.40812148, - 0.29019122, 0.88861599, 0.65524846, 0.43563456, 0.38177187, - 0.70832965, 0.81527892, 0.68832812, 0.38833192, 0.4561522 , - 0.14828817, 0.47248213, 0.54357335, 0.82009566, 0.1338884 , - 0.02755417, 0.19764677, 0.2422084 ,
0.04757674, 0.65409606, - 0.0824589 , 0.03304383, 0.94387689, 0.98764509, 0.82433901, - 0.27646741, 0.64907493, 0.76009406, 0.30087915, 0.17904689, - 0.41601714, 0.67046398, 0.10422822, 0.08447374, 0.07354344, - 0.61423565, 0.70284866, 0.7532333 , 0.1972038 , 0.29575659, - 0.90583886, 0.29265307, 0.50000175, 0.70407655, 0.889363 , - 0.81904418, 0.66829128, 0.64468815, 0.56563723, 0.85601875, - 0.94924672, 0.00166762, 0.25220643, 0.74540219, 0.67993247, - 0.1549675 , 0.39385352, 0.92153607, 0.63745931, 0.27759043, - 0.84702295, 0.65904271, 0.58676614, 0.8666936 , 0.39607438, - 0.79954983, 0.42220697, 0.39650381, 0.7849864 , 0.56150201, - 0.15678925, 0.14746032, 0.34542114, 0.47026783, 0.11956489, - 0.25421435, 0.33788901, 0.68934842, 0.36424685, 0.71737898, - 0.38983449, 0.94393779, 0.39575588, 0.36616553, 0.87104665, - 0.64630203, 0.22516905, 0.88270804, 0.15031338, 0.75144345, - 0.46459025, 0.85396454, 0.86355643, 0.65139851, 0.70266061, - 0.30241389, 0.81056497, 0.88865969, 0.38773807, 0.70635849, - 0.90718459, 0.43245789, 0.28000654, 0.45935562, 0.08773519, - 0.9552151 , 0.93901511, 0.22489288], mindspore.float32), # Ensure spouts are float32 - Tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0.], mindspore.float32), # Ensure spouts are float32 - ] # fmt: skip - - for i, spout in enumerate(spouts): - if spout.ndim == 1: - spouts[i] = spout.expand_dims(0) - - spout_tensor = ops.stack(spouts) - output1 = model.generate( - input_ids=input_ids, - spout=spouts[0], - max_new_tokens=20, - generation_config=generation_config, - ) - - output2 = model.generate( - input_ids=input_ids, - spout=spouts[1], - max_new_tokens=20, - generation_config=generation_config, - ) - - output3 = model.generate( - input_ids=input_ids_batch, - spout=spout_tensor, - max_new_tokens=20, - generation_config=generation_config, - ) - - out1_sentence = tokenizer.decode(output1[0]) - out2_sentence = tokenizer.decode(output2[0]) - batch_out_sentence = tokenizer.batch_decode(output3) - - expected_output_sentence = [ - "武田信玄は、武田氏の滅亡後、武田氏の居城であった甲斐武田氏の居城である", - "武田信玄は、武田家の滅亡を防ぐため、武田家の家臣である武田信虎を討", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence]) - - def test_prefix_lm_generation(self): - model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese") - tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") - model.to(dtype=mindspore.float32) - for param in model.get_parameters(): - # 如果参数的类型不是 Float32,转换为 Float32 - if param.dtype != mindspore.float32: - param.set_dtype(mindspore.float32) - - # set deterministically - generation_config = GenerationConfig.from_pretrained("Tanrei/GPTSAN-japanese") - generation_config.top_k = 1 - - prefix_text_1 = "武田信玄" - prefix_text_2 = "織田信長" - input_text_1 = "は、" - input_text_2 = "が、" - - input_tok_1 = tokenizer(input_text_1, prefix_text=prefix_text_1, return_tensors="ms") - input_tok_2 = tokenizer(input_text_2, prefix_text=prefix_text_2, return_tensors="ms") - input_tok_3 = 
tokenizer([[prefix_text_1, input_text_1], [prefix_text_2, input_text_2]], return_tensors="ms") - - output1 = model.generate( - input_ids=input_tok_1.input_ids, - token_type_ids=input_tok_1.token_type_ids, - max_new_tokens=20, - generation_config=generation_config, - ) - - output2 = model.generate( - input_ids=input_tok_2.input_ids, - token_type_ids=input_tok_2.token_type_ids, - max_new_tokens=20, - generation_config=generation_config, - ) - - output3 = model.generate( - input_ids=input_tok_3.input_ids, - token_type_ids=input_tok_3.token_type_ids, - attention_mask=input_tok_3.attention_mask, - max_new_tokens=20, - generation_config=generation_config, - ) - - out1_sentence = tokenizer.decode(output1[0]) - out2_sentence = tokenizer.decode(output2[0]) - batch_out_sentence = tokenizer.batch_decode(output3) - - expected_output_sentence = [ - "武田信玄は、武田氏の祖である武田信虎を、その子・武田信友を擁して", - "織田信長が、織田信長の妻・お市の方を妻として迎えたという逸話が残", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(batch_out_sentence, [out1_sentence, out2_sentence]) diff --git a/tests/transformers/models/graphormer/__init__.py b/tests/transformers/models/graphormer/__init__.py deleted file mode 100644 index fa40af140..000000000 --- a/tests/transformers/models/graphormer/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test Graphormer -""" diff --git a/tests/transformers/models/graphormer/test_graphormer_cells.py b/tests/transformers/models/graphormer/test_graphormer_cells.py deleted file mode 100644 index c93b48caa..000000000 --- a/tests/transformers/models/graphormer/test_graphormer_cells.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -""" -Test Graphormer Cell -""" -import unittest -import numpy as np -import mindspore as ms - - -from mindnlp.transformers.models.graphormer.modeling_graphormer import ( - GraphormerMultiheadAttention, - GraphormerGraphEncoderLayer, - GraphormerGraphEncoder) -from mindnlp.transformers.models.graphormer.configuration_graphormer import ( - GraphormerConfig) - -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -from ...test_configuration_common import ConfigTester -from .test_modeling_graphormer import GraphormerModelTester - - -class GraphormerMultiheadAttentionTest(ModelTesterMixin, unittest.TestCase): - def setUp(self): - self.model_tester = GraphormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraphormerConfig, has_text_modality=False) - - def prepare_config_and_inputs(self, batch_size: int = 10): - config = self.model_tester.get_config() - mdt = self.model_tester - query = floats_tensor([mdt.graph_size + 1, mdt.batch_size, mdt.embedding_dim]) - attn_bias = ids_tensor( - [mdt.batch_size * mdt.num_attention_heads, mdt.graph_size + 1, mdt.graph_size + 1], - config.num_atoms * 2 + 1) - key_padding_mask = ms.tensor(np.full((mdt.batch_size, mdt.graph_size + 1), False)) - inputs = dict(query=query, - key=query, - value=query, - attn_bias=attn_bias, - key_padding_mask=key_padding_mask, - need_weights=False, - attn_mask=None) - - return config, inputs - - def test_model(self): - config, inputs = self.prepare_config_and_inputs() - model = GraphormerMultiheadAttention(config) - result = model(**inputs) - self.assertEqual( - result[0].shape, (self.model_tester.graph_size + 1, - self.model_tester.batch_size, - self.model_tester.hidden_size) - ) - - -class GraphormerGraphEncoderLayerTest(ModelTesterMixin, unittest.TestCase): - def setUp(self): - self.model_tester = GraphormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraphormerConfig, has_text_modality=False) - - def prepare_config_and_inputs(self, batch_size: int = 10): - config = self.model_tester.get_config() - mdt = self.model_tester - input_nodes = floats_tensor([mdt.graph_size + 1, mdt.batch_size, mdt.embedding_dim]) - self_attn_bias = ids_tensor([mdt.batch_size, - mdt.num_attention_heads, - mdt.graph_size + 1, - mdt.graph_size + 1],config.num_atoms * 2 + 1) - self_attn_padding_mask = ms.tensor(np.full((mdt.batch_size, mdt.graph_size + 1), False)) - inputs = dict(input_nodes = input_nodes, - self_attn_bias = self_attn_bias, - self_attn_mask = None, - self_attn_padding_mask = self_attn_padding_mask) - return config, inputs - - def test_model(self): - config, inputs = self.prepare_config_and_inputs() - model = GraphormerGraphEncoderLayer(config) - result = model(**inputs) - self.assertEqual( - result[0].shape, (self.model_tester.graph_size + 1, - self.model_tester.batch_size, - self.model_tester.hidden_size) - ) - - -class GraphormerGraphEncoderTest(ModelTesterMixin, unittest.TestCase): - def setUp(self): - self.model_tester = GraphormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraphormerConfig, - has_text_modality=False) - - def prepare_config_and_inputs(self, batch_size: int = 10): - config = self.model_tester.get_config() - mdt = self.model_tester - input_nodes = ids_tensor([mdt.batch_size, mdt.graph_size, 1], - mdt.num_atoms) - input_edges = ids_tensor([mdt.batch_size, - mdt.graph_size, - mdt.graph_size, - mdt.multi_hop_max_dist, 1], 
mdt.num_edges) - attn_bias = ids_tensor([mdt.batch_size, - mdt.graph_size + 1, - mdt.graph_size + 1], mdt.num_atoms) - in_degree = ids_tensor([mdt.batch_size, mdt.graph_size], mdt.num_in_degree) - out_degree = ids_tensor([mdt.batch_size, mdt.graph_size], mdt.num_out_degree) - spatial_pos = ids_tensor([mdt.batch_size, mdt.graph_size, mdt.graph_size], mdt.num_spatial) - attn_edge_type = ids_tensor([mdt.batch_size, mdt.graph_size, mdt.graph_size, 1], mdt.num_edges) - - - self_attn_bias = ids_tensor([mdt.batch_size, - mdt.num_attention_heads, - mdt.graph_size + 1, - mdt.graph_size + 1],config.num_atoms * 2 + 1) - self_attn_padding_mask = ms.tensor(np.full((mdt.batch_size, mdt.graph_size + 1), False)) - inputs = dict(input_nodes=input_nodes, - input_edges=input_edges, - attn_bias=attn_bias, - in_degree=in_degree, - out_degree=out_degree, - spatial_pos=spatial_pos, - attn_edge_type=attn_edge_type) - return config, inputs - - def test_model(self): - config, inputs = self.prepare_config_and_inputs() - model = GraphormerGraphEncoder(config) - inner_states, graph_rep = model(**inputs) - # what about layerdrop? - self.assertEqual(len(inner_states), self.model_tester.num_hidden_layers+1) - # difference between hidden_size and embedding_dim? - self.assertEqual(inner_states[0].shape, (self.model_tester.graph_size + 1, - self.model_tester.batch_size, - self.model_tester.embedding_dim)) - self.assertEqual(graph_rep.shape, (self.model_tester.batch_size, - self.model_tester.embedding_dim)) diff --git a/tests/transformers/models/graphormer/test_modeling_graphormer.py b/tests/transformers/models/graphormer/test_modeling_graphormer.py deleted file mode 100644 index dfdb59a6a..000000000 --- a/tests/transformers/models/graphormer/test_modeling_graphormer.py +++ /dev/null @@ -1,1206 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# ============================================================================ -""" -Test Graphormer -""" -import unittest -import copy -import inspect -import os -import tempfile -import unittest -import numpy as np - -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers.models.graphormer.configuration_graphormer import GraphormerConfig - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor - -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow -) - -if is_mindspore_available(): - from mindspore import tensor - from mindnlp.transformers.models.graphormer.modeling_graphormer import( - GraphormerModel, - GraphormerForGraphClassification, - GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST) - -class GraphormerModelTester: - def __init__( - self, - parent, - num_classes=1, - num_atoms=32 * 9, - num_edges=32 * 3, - num_in_degree=32, - num_out_degree=32, - num_spatial=32, - num_edge_dis=16, - multi_hop_max_dist=5, # sometimes is 20 - spatial_pos_max=32, - edge_type="multi_hop", - init_fn=None, - max_nodes=32, - share_input_output_embed=False, - num_hidden_layers=2, - embedding_dim=32, - ffn_embedding_dim=32, - num_attention_heads=4, - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.1, - layerdrop=0.0, - encoder_normalize_before=False, - pre_layernorm=False, - apply_graphormer_init=False, - activation_fn="gelu", - embed_scale=None, - freeze_embeddings=False, - num_trans_layers_to_freeze=0, - traceable=False, - q_noise=0.0, - qn_block_size=8, - kdim=None, - vdim=None, - bias=True, - self_attention=True, - batch_size=10, - graph_size=20, - is_training=True, - ): - self.parent = parent - self.num_classes = num_classes - self.num_labels = num_classes - self.num_atoms = num_atoms - self.num_in_degree = num_in_degree - self.num_out_degree = num_out_degree - self.num_edges = num_edges - self.num_spatial = num_spatial - self.num_edge_dis = num_edge_dis - self.edge_type = edge_type - self.multi_hop_max_dist = multi_hop_max_dist - self.spatial_pos_max = spatial_pos_max - self.max_nodes = max_nodes - self.num_hidden_layers = num_hidden_layers - self.embedding_dim = embedding_dim - self.hidden_size = embedding_dim - self.ffn_embedding_dim = ffn_embedding_dim - self.num_attention_heads = num_attention_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.layerdrop = layerdrop - self.encoder_normalize_before = encoder_normalize_before - self.pre_layernorm = pre_layernorm - self.apply_graphormer_init = apply_graphormer_init - self.activation_fn = activation_fn - self.embed_scale = embed_scale - self.freeze_embeddings = freeze_embeddings - self.num_trans_layers_to_freeze = num_trans_layers_to_freeze - self.share_input_output_embed = share_input_output_embed - self.traceable = traceable - self.q_noise = q_noise - self.qn_block_size = qn_block_size - self.init_fn = init_fn - self.kdim = kdim - self.vdim = vdim - self.self_attention = self_attention - self.bias = bias - self.batch_size = batch_size - self.graph_size = graph_size - self.is_training = is_training - - def prepare_config_and_inputs(self): - attn_bias = ids_tensor( - [self.batch_size, self.graph_size + 1, self.graph_size + 1], self.num_atoms - ) # Def not sure here - attn_edge_type = ids_tensor([self.batch_size, self.graph_size, self.graph_size, 1], self.num_edges) - spatial_pos = ids_tensor([self.batch_size, self.graph_size, self.graph_size], 
self.num_spatial) - in_degree = ids_tensor([self.batch_size, self.graph_size], self.num_in_degree) - out_degree = ids_tensor([self.batch_size, self.graph_size], self.num_out_degree) - input_nodes = ids_tensor([self.batch_size, self.graph_size, 1], self.num_atoms) - input_edges = ids_tensor( - [self.batch_size, self.graph_size, self.graph_size, self.multi_hop_max_dist, 1], self.num_edges - ) - labels = ids_tensor([self.batch_size], self.num_classes) - - config = self.get_config() - return config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels - - def get_config(self): - return GraphormerConfig( - num_atoms=self.num_atoms, - num_in_degree=self.num_in_degree, - num_out_degree=self.num_out_degree, - num_edges=self.num_edges, - num_spatial=self.num_spatial, - num_edge_dis=self.num_edge_dis, - edge_type=self.edge_type, - multi_hop_max_dist=self.multi_hop_max_dist, - spatial_pos_max=self.spatial_pos_max, - max_nodes=self.max_nodes, - num_hidden_layers=self.num_hidden_layers, - embedding_dim=self.embedding_dim, - hidden_size=self.embedding_dim, - ffn_embedding_dim=self.ffn_embedding_dim, - num_attention_heads=self.num_attention_heads, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - activation_dropout=self.activation_dropout, - layerdrop=self.layerdrop, - encoder_normalize_before=self.encoder_normalize_before, - pre_layernorm=self.pre_layernorm, - apply_graphormer_init=self.apply_graphormer_init, - activation_fn=self.activation_fn, - embed_scale=self.embed_scale, - freeze_embeddings=self.freeze_embeddings, - num_trans_layers_to_freeze=self.num_trans_layers_to_freeze, - share_input_output_embed=self.share_input_output_embed, - traceable=self.traceable, - q_noise=self.q_noise, - qn_block_size=self.qn_block_size, - init_fn=self.init_fn, - kdim=self.kdim, - vdim=self.vdim, - self_attention=self.self_attention, - bias=self.bias, - ) - - def create_and_check_model( - self, config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels - ): - model = GraphormerModel(config=config) - model.set_train(False) - result = model( - input_nodes=input_nodes, - attn_bias=attn_bias, - in_degree=in_degree, - out_degree=out_degree, - spatial_pos=spatial_pos, - input_edges=input_edges, - attn_edge_type=attn_edge_type, - labels=labels, - ) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.graph_size + 1, self.hidden_size) - ) - - def create_and_check_for_graph_classification( - self, config, attn_bias, attn_edge_type, spatial_pos, in_degree, out_degree, input_nodes, input_edges, labels - ): - model = GraphormerForGraphClassification(config) - model.set_train(False) - result = model( - input_nodes=input_nodes, - attn_bias=attn_bias, - in_degree=in_degree, - out_degree=out_degree, - spatial_pos=spatial_pos, - input_edges=input_edges, - attn_edge_type=attn_edge_type, - labels=labels - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - attn_bias, - attn_edge_type, - spatial_pos, - in_degree, - out_degree, - input_nodes, - input_edges, - labels, - ) = config_and_inputs - inputs_dict = { - "attn_bias": attn_bias, - "attn_edge_type": attn_edge_type, - "spatial_pos": spatial_pos, - "in_degree": in_degree, - "out_degree": out_degree, - "input_nodes": input_nodes, - "input_edges": input_edges, - "labels": labels, - } - return 
config, inputs_dict - -@require_mindspore -class GraphormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (GraphormerForGraphClassification,) - all_generative_model_classes = () - pipeline_model_mapping = {"feature-extraction": GraphormerModel} - test_pruning = False - test_head_masking = False - test_resize_embeddings = False - main_input_name_nodes = "input_nodes" - main_input_name_edges = "input_edges" - has_attentions = False # does not output attention - - def setUp(self): - self.model_tester = GraphormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=GraphormerConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Graphormer does not use one single inputs_embedding but three") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Graphormer does not implement feed forward chunking") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="Graphormer does not share input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Graphormer does not share common arg names") - def test_forward_signature(self): - pass - - def test_initialization(self): - def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - return configs_no_init - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - batch_size = self.model_tester.batch_size - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [batch_size, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - # Always returns hidden_states - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="Skip the grad related tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - # Inputs are 'input_nodes' and 'input_edges' not 'input_ids' - def test_model_main_input_name(self): - for model_class in self.all_model_classes: - model_signature = inspect.signature(getattr(model_class, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name_nodes = list(model_signature.parameters.keys())[1] - observed_main_input_name_edges = list(model_signature.parameters.keys())[2] - self.assertEqual(model_class.main_input_name_nodes, 
observed_main_input_name_nodes) - self.assertEqual(model_class.main_input_name_edges, observed_main_input_name_edges) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_nodes", "input_edges"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_graph_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_graph_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in GRAPHORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = GraphormerForGraphClassification.from_pretrained(model_name) - self.assertIsNotNone(model) - -@require_mindspore -class GraphormerModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_graph_classification(self): - model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2") - - # Actual real graph data from the MUTAG dataset - # fmt: off - model_input = { - "attn_bias": tensor( - [ - [ - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - ], - [ - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf"), float("-inf"), float("-inf")], - ], - ] - ), - "attn_edge_type": tensor( - [ - [ - [[0], [3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [3], [0], [3], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [3], [0], [3], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [3], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [3], [0], [0], [0]], - [[0], [0], [0], [3], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [3], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [3], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [3], [3]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0]], - [[0], 
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0]], - ], - [ - [[0], [3], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0]], - [[3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [3], [0], [3], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [3], [0], [0], [0], [3], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [3], [0], [3], [3], [0], [0], [0], [0], [0], [0]], - [[3], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [3], [3], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [3], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]], - ], - ] - ), - # fmt: on - "spatial_pos": tensor( - [ - [ - [1, 2, 3, 4, 3, 2, 4, 5, 6, 5, 6, 7, 8, 7, 9, 10, 10], - [2, 1, 2, 3, 4, 3, 5, 6, 5, 4, 5, 6, 7, 6, 8, 9, 9], - [3, 2, 1, 2, 3, 4, 4, 5, 4, 3, 4, 5, 6, 5, 7, 8, 8], - [4, 3, 2, 1, 2, 3, 3, 4, 3, 2, 3, 4, 5, 4, 6, 7, 7], - [3, 4, 3, 2, 1, 2, 2, 3, 4, 3, 4, 5, 6, 5, 7, 8, 8], - [2, 3, 4, 3, 2, 1, 3, 4, 5, 4, 5, 6, 7, 6, 8, 9, 9], - [4, 5, 4, 3, 2, 3, 1, 2, 3, 4, 5, 6, 5, 4, 6, 7, 7], - [5, 6, 5, 4, 3, 4, 2, 1, 2, 3, 4, 5, 4, 3, 5, 6, 6], - [6, 5, 4, 3, 4, 5, 3, 2, 1, 2, 3, 4, 3, 2, 4, 5, 5], - [5, 4, 3, 2, 3, 4, 4, 3, 2, 1, 2, 3, 4, 3, 5, 6, 6], - [6, 5, 4, 3, 4, 5, 5, 4, 3, 2, 1, 2, 3, 4, 4, 5, 5], - [7, 6, 5, 4, 5, 6, 6, 5, 4, 3, 2, 1, 2, 3, 3, 4, 4], - [8, 7, 6, 5, 6, 7, 5, 4, 3, 4, 3, 2, 1, 2, 2, 3, 3], - [7, 6, 5, 4, 5, 6, 4, 3, 2, 3, 4, 3, 2, 1, 3, 4, 4], - [9, 8, 7, 6, 7, 8, 6, 5, 4, 5, 4, 3, 2, 3, 1, 2, 2], - [10, 9, 8, 7, 8, 9, 7, 6, 5, 6, 5, 4, 3, 4, 2, 1, 3], - [10, 9, 8, 7, 8, 9, 7, 6, 5, 6, 5, 4, 3, 4, 2, 3, 1], - ], - [ - [1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 4, 5, 5, 0, 0, 0, 0], - [2, 1, 2, 3, 4, 5, 4, 3, 4, 3, 5, 6, 6, 0, 0, 0, 0], - [3, 2, 1, 2, 3, 4, 3, 2, 3, 4, 4, 5, 5, 0, 0, 0, 0], - [4, 3, 2, 1, 2, 3, 4, 3, 4, 5, 5, 6, 6, 0, 0, 0, 0], - [5, 4, 3, 2, 1, 2, 3, 4, 5, 6, 6, 7, 7, 0, 0, 0, 0], - [6, 5, 4, 3, 2, 1, 2, 3, 4, 5, 5, 6, 6, 0, 0, 0, 0], - [5, 4, 3, 4, 3, 2, 1, 2, 3, 4, 4, 5, 5, 0, 0, 0, 0], - [4, 3, 2, 3, 4, 3, 2, 1, 2, 3, 3, 4, 4, 0, 0, 0, 0], - [3, 4, 3, 4, 5, 4, 3, 2, 1, 2, 2, 3, 3, 0, 0, 0, 0], - [2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 3, 4, 4, 0, 0, 0, 0], - [4, 5, 4, 5, 6, 5, 4, 3, 2, 3, 1, 2, 2, 0, 0, 0, 0], - [5, 6, 5, 6, 7, 6, 5, 4, 3, 4, 2, 1, 3, 0, 0, 0, 0], - [5, 6, 5, 6, 7, 6, 5, 4, 3, 4, 2, 3, 1, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ], - ] 
- ), - "in_degree": tensor( - [ - [3, 3, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 4, 2, 2], - [3, 3, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 0, 0, 0, 0], - ] - ), - "out_degree": tensor( - [ - [3, 3, 3, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 4, 2, 2], - [3, 3, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 2, 0, 0, 0, 0], - ] - ), - "input_nodes": tensor( - [ - [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3]], - [[3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [0], [0], [0], [0]], - ] - ), - "input_edges": tensor( - [ - [ - [ - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], 
- ], - [ - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - ], - [ - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - 
[[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - ], - [ - [ - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - 
[[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [0], [0]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], 
[4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [4]], - [[4], [4], [4], [4], [0]], - [[4], [4], [4], [0], [0]], - [[4], [4], [0], [0], [0]], - [[4], [4], [4], [0], [0]], - [[4], [0], [0], [0], [0]], - [[4], [4], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - [ - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - [[0], [0], [0], [0], [0]], - ], - ], - ] - ), - "labels": tensor([1, 0]), - } - - output = model(**model_input)["logits"] - - expected_shape = (2, 1) - 
self.assertEqual(output.shape, expected_shape) - - expected_logs = tensor( - [[7.6060], [7.4126]] - ) - self.assertTrue(np.allclose(output.asnumpy(), expected_logs.asnumpy(), atol=5e-3)) - - r""" - Test Graphormer - """ - def setUp(self): - """ - Set up. - """ - self.config = GraphormerConfig(n_layer=2, vocab_size=1000, - n_embd=128, hidden_size=128, - n_head=8) - - def test_graphormer_model(self): - r""" - Test GraphormerModel - """ - model = GraphormerModel(self.config) diff --git a/tests/transformers/models/groupvit/__init__.py b/tests/transformers/models/groupvit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/groupvit/test_modeling_groupvit.py b/tests/transformers/models/groupvit/test_modeling_groupvit.py deleted file mode 100644 index a9d89b8f3..000000000 --- a/tests/transformers/models/groupvit/test_modeling_groupvit.py +++ /dev/null @@ -1,589 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch GroupViT model.""" -# pylint: disable=missing-timeout -import inspect -import random -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, - is_mindspore_available -) -from mindnlp.utils.import_utils import is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - from mindnlp.transformers import GroupViTModel, GroupViTTextModel, GroupViTVisionModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import CLIPProcessor - - -class GroupViTVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - depths=[6, 3, 3], - num_group_tokens=[64, 8, 0], - num_output_groups=[64, 8, 8], - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.depths = depths - self.num_hidden_layers = sum(depths) - self.expected_num_hidden_layers = len(depths) + 1 - self.num_group_tokens = num_group_tokens - self.num_output_groups = num_output_groups - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - 
num_patches = (image_size // patch_size) ** 2 - # no [CLS] token for GroupViT - self.seq_length = num_patches - - def prepare_config_and_inputs(self): - rng = random.Random(0) - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return GroupViTVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - depths=self.depths, - num_group_tokens=self.num_group_tokens, - num_output_groups=self.num_output_groups, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = GroupViTVisionModel(config=config) - - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.num_output_groups[-1], self.hidden_size) - ) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as GROUPVIT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (GroupViTVisionModel,) if is_mindspore_available() else () - - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = GroupViTVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="GroupViT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - - expected_num_attention_outputs = sum(g > 0 for g in self.model_tester.num_group_tokens) - 
- for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - # GroupViT returns attention grouping of each stage - self.assertEqual(len(attentions), sum(g > 0 for g in self.model_tester.num_group_tokens)) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - # GroupViT returns attention grouping of each stage - self.assertEqual(len(attentions), expected_num_attention_outputs) - - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - # GroupViT returns attention grouping of each stage - self.assertEqual(len(self_attentions), expected_num_attention_outputs) - for i, self_attn in enumerate(self_attentions): - if self_attn is None: - continue - - self.assertListEqual( - list(self_attentions[i].shape[-2:]), - [ - self.model_tester.num_output_groups[i], - self.model_tester.num_output_groups[i - 1] if i > 0 else seq_len, - ], - ) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="GroupViTVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="GroupViTVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - # override since the attention mask from GroupViT is not used to compute loss, thus no grad - def test_retain_grad_hidden_states_attentions(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "nvidia/groupvit-gcc-yfcc" - model = GroupViTVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class GroupViTTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - 
self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - rng = random.Random(0) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, rng=rng) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return GroupViTTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = GroupViTTextModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class GroupViTTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (GroupViTTextModel,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = GroupViTTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="GroupViTTextModel does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="GroupViTTextModel has no base class and is not available in MODEL_MAPPING") - def 
test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="GroupViTTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "nvidia/groupvit-gcc-yfcc" - model = GroupViTTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class GroupViTModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = GroupViTTextModelTester(parent, **text_kwargs) - self.vision_model_tester = GroupViTVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return GroupViTConfig.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = GroupViTModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size) - ) - self.parent.assertEqual( - result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class GroupViTModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (GroupViTModel,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": GroupViTModel} if is_mindspore_available() else {} - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = GroupViTModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="hidden_states are tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="input_embeds are tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="GroupViTModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # overwritten from parent as this equivalent test needs a specific `seed` and hard to get a good one! 
- def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-5, name="outputs", attributes=None): - super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol=tol, name=name, attributes=attributes) - - # override as the `logit_scale` parameter initilization is different for GROUPVIT - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save GroupViTConfig and check if we can load GroupViTVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = GroupViTVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save GroupViTConfig and check if we can load GroupViTTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = GroupViTTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "nvidia/groupvit-gcc-yfcc" - model = GroupViTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class GroupViTModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "nvidia/groupvit-gcc-yfcc" - model = GroupViTModel.from_pretrained(model_name) - processor = CLIPProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="ms" - ) - - # forward pass - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - expected_logits = mindspore.tensor([[13.3523, 6.3629]]) - print(outputs.logits_per_image.asnumpy()) - self.assertTrue(np.allclose(outputs.logits_per_image.asnumpy(), expected_logits.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/hubert/__init__.py b/tests/transformers/models/hubert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/hubert/test_modeling_hubert.py b/tests/transformers/models/hubert/test_modeling_hubert.py deleted file mode 100644 index 5bbbe8cb4..000000000 --- 
a/tests/transformers/models/hubert/test_modeling_hubert.py +++ /dev/null @@ -1,739 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Hubert model.""" - -import math -import os -import pickle -import tempfile -import unittest -import pytest - -from mindnlp.transformers import HubertConfig -from mindnlp.utils.testing_utils import require_soundfile, require_mindspore, slow -from mindnlp.utils import is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad, get_default_dtype - - from mindnlp.transformers import ( - HubertForCTC, - HubertForSequenceClassification, - HubertModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - from mindnlp.transformers.models.hubert.modeling_hubert import _compute_mask_indices - - -class HubertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - vocab_size=32, - do_stable_layer_norm=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.scope = scope - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - 
self.encoder_seq_length = self.output_seq_length - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return HubertConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - do_stable_layer_norm=self.do_stable_layer_norm, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = HubertModel(config=config) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = HubertModel(config=config) - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(ops.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = HubertForCTC(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels.tolist()) - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, 
*args): - model = HubertForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = HubertForCTC(config=config) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels.tolist()) - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = HubertForSequenceClassification(config=config) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = HubertForCTC(config) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels.tolist()) - 2), model.config.vocab_size + 100) - - with pytest.raises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class HubertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (HubertForCTC, HubertForSequenceClassification, HubertModel) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "audio-classification": HubertForSequenceClassification, - "automatic-speech-recognition": HubertForCTC, 
- "feature-extraction": HubertModel, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = HubertModelTester(self) - self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Hubert has no inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Hubert has no inputs_embeds") - def test_forward_signature(self): - pass - - # Hubert cannot resize token embeddings - # since it has no tokens embeddings - @unittest.skip(reason="Hubert has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Hubert has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.fill_(3) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = HubertModel.from_pretrained("facebook/hubert-base-ls960") - 
self.assertIsNotNone(model) - - -@require_mindspore -class HubertRobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (HubertForCTC, HubertForSequenceClassification, HubertModel) if is_mindspore_available() else () - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = HubertModelTester( - self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True - ) - self.config_tester = ConfigTester(self, config_class=HubertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Hubert has no inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Hubert has input_values instead of input_ids") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Hubert has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Hubert has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.fill_(3) - if hasattr(module, "masked_spec_embed") and 
module.masked_spec_embed is not None: - module.masked_spec_embed.fill_(3) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") - self.assertIsNotNone(model) - - -@require_mindspore -class HubertUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - -@require_mindspore -@require_soundfile -@slow -class HubertModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - from datasets import load_dataset - - ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) - - return ds[:num_samples] - - def test_inference_ctc_batched(self): - model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft", ms_dtype=mindspore.float16) - processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values.half() - attention_mask = inputs.attention_mask - - with no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_keyword_spotting(self): - model = HubertForSequenceClassification.from_pretrained( - "superb/hubert-base-superb-ks", ms_dtype=mindspore.float16 - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ks") - input_data = self._load_superb("ks", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - - input_values = inputs.input_values.half() - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) - - expected_labels = [2, 6, 10, 9] - # s3prl logits for the same batch - expected_logits = 
mindspore.tensor([7.6692, 17.7795, 11.1562, 11.8232], dtype=mindspore.float16) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=3e-2)) - - def test_inference_intent_classification(self): - model = HubertForSequenceClassification.from_pretrained( - "superb/hubert-base-superb-ic", ms_dtype=mindspore.float16 - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-ic") - input_data = self._load_superb("ic", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - - input_values = inputs.input_values.half() - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - - predicted_logits_action, predicted_ids_action = ops.max(outputs.logits[:, :6], dim=-1) - predicted_logits_object, predicted_ids_object = ops.max(outputs.logits[:, 6:20], dim=-1) - predicted_logits_location, predicted_ids_location = ops.max(outputs.logits[:, 20:24], dim=-1) - - expected_labels_action = [1, 0, 4, 3] - expected_logits_action = mindspore.tensor( - [5.9052, 12.5865, 4.4840, 10.0240], dtype=mindspore.float16 - ) - expected_labels_object = [1, 10, 3, 4] - expected_logits_object = mindspore.tensor( - [5.5316, 11.7946, 8.1672, 23.2415], dtype=mindspore.float16 - ) - expected_labels_location = [0, 0, 0, 1] - expected_logits_location = mindspore.tensor( - [5.2053, 8.9577, 10.0447, 8.1481], dtype=mindspore.float16 - ) - - self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) - self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) - self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - - # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572 - self.assertTrue(ops.allclose(predicted_logits_action, expected_logits_action, atol=3e-1)) - self.assertTrue(ops.allclose(predicted_logits_object, expected_logits_object, atol=3e-1)) - self.assertTrue(ops.allclose(predicted_logits_location, expected_logits_location, atol=3e-1)) - - def test_inference_speaker_identification(self): - model = HubertForSequenceClassification.from_pretrained( - "superb/hubert-base-superb-sid", ms_dtype=mindspore.float16 - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-sid") - input_data = self._load_superb("si", 4) - - output_logits = [] - with no_grad(): - for example in input_data["speech"]: - input = processor(example, return_tensors="ms", padding=True) - output = model(input.input_values.half(), attention_mask=None) - output_logits.append(output.logits[0]) - - output_logits = ops.stack(output_logits) - predicted_logits, predicted_ids = ops.max(output_logits, dim=-1) - - expected_labels = [5, 1, 1, 3] - # s3prl logits for the same batch - expected_logits = mindspore.tensor( - [78231.5547, 123166.6094, 122785.4141, 84851.2969], dtype=mindspore.float16 - ) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572 - self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=10)) - - def test_inference_emotion_recognition(self): - model = HubertForSequenceClassification.from_pretrained( - "superb/hubert-base-superb-er", ms_dtype=mindspore.float16 - ) - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/hubert-base-superb-er") - input_data = self._load_superb("er", 4) - 
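# --- Editor's note (illustrative sketch, not part of the original test file) ---
# test_inference_intent_classification above decodes one flat logit vector as three
# concatenated sub-tasks -- action (6 classes), object (14 classes) and location
# (4 classes) -- by slicing and taking a per-slice argmax; `ops.max(..., dim=-1)`
# returns the (values, indices) pair used for the score/label comparisons. The same
# decoding in plain NumPy:
import numpy as np

logits = np.random.randn(4, 24)                  # (batch, 6 + 14 + 4) logits
action_ids = logits[:, :6].argmax(axis=-1)       # predicted action per example
object_ids = logits[:, 6:20].argmax(axis=-1)     # predicted object per example
location_ids = logits[:, 20:24].argmax(axis=-1)  # predicted location per example
action_scores = logits[:, :6].max(axis=-1)       # the "values" half of ops.max
# --- end editor's note ---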
inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - - input_values = inputs.input_values.half() - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) - - expected_labels = [1, 1, 2, 2] - # s3prl logits for the same batch - expected_logits = mindspore.tensor([2.8384, 2.3389, 3.8564, 4.5558], dtype=mindspore.float16) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572 - self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-1)) - - def test_inference_distilhubert(self): - model = HubertModel.from_pretrained("ntu-spml/distilhubert") - processor = Wav2Vec2FeatureExtractor.from_pretrained("ntu-spml/distilhubert") - - # TODO: can't test on batched inputs due to incompatible padding https://github.com/pytorch/fairseq/pull/3572 - input_speech = self._load_datasamples(1) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - with no_grad(): - outputs = model(input_values).last_hidden_state - - # expected outputs taken from the original SEW implementation - expected_outputs_first = mindspore.tensor( - [ - [ - [-0.3505, 0.1167, 0.0608, 0.1294], - [-0.3085, 0.0481, 0.1106, 0.0955], - [-0.3107, -0.0391, 0.0739, 0.1360], - [-0.2385, -0.1795, -0.0928, 0.2389], - ] - ], - ) - expected_outputs_last = mindspore.tensor( - [ - [ - [-0.0732, 0.0255, 0.0529, -0.1372], - [-0.0812, 0.1259, 0.0564, -0.0438], - [-0.0054, 0.0758, -0.0002, -0.1617], - [0.0133, -0.0320, -0.0687, 0.0062], - ] - ], - ) - expected_output_sum = -3776.0730 - - self.assertTrue(ops.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3)) - self.assertTrue(ops.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3)) - print(outputs.sum() - expected_output_sum) - self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) diff --git a/tests/transformers/models/ibert/__init__.py b/tests/transformers/models/ibert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/ibert/test_modeling_ibert.py b/tests/transformers/models/ibert/test_modeling_ibert.py deleted file mode 100644 index 76bdaeb89..000000000 --- a/tests/transformers/models/ibert/test_modeling_ibert.py +++ /dev/null @@ -1,738 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
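# --- Editor's note (illustrative sketch, not part of the original test file) ---
# The I-BERT quantization tests below repeatedly check the same symmetric-quantization
# invariant: scale = max(|x|) / (2 ** (bit - 1) - 1), the quantized values lie on an
# integer grid, and the dequantization error never exceeds one scale step. A minimal
# framework-agnostic version of that invariant:
import numpy as np

def symmetric_quantize(x, bit=8):
    scale = np.abs(x).max() / (2 ** (bit - 1) - 1)
    return np.round(x / scale), scale

x = np.array([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]])
q, scale = symmetric_quantize(x)
assert np.allclose(q, np.round(q))                    # quantized values are integers
assert np.all(np.abs(q * scale - x) <= scale + 1e-8)  # error bounded by one scale step
# --- end editor's note ---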
- - -import copy -import unittest - -from mindnlp.transformers import IBertConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - IBertForMaskedLM, - IBertForMultipleChoice, - IBertForQuestionAnswering, - IBertForSequenceClassification, - IBertForTokenClassification, - IBertModel, - ) - from mindnlp.transformers.models.ibert.modeling_ibert import ( - IBertEmbeddings, - IntGELU, - IntLayerNorm, - IntSoftmax, - QuantAct, - QuantEmbedding, - QuantLinear, - create_position_ids_from_input_ids, - ) - - -class IBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return IBertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - 
max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - quant_mode=True, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = IBertModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = IBertForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = IBertForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = IBertForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = IBertForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class IBertModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = 
False - test_torchscript = False - test_head_masking = False - test_resize_embeddings = False - - all_model_classes = ( - ( - IBertForMaskedLM, - IBertModel, - IBertForSequenceClassification, - IBertForTokenClassification, - IBertForMultipleChoice, - IBertForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": IBertModel, - "fill-mask": IBertForMaskedLM, - "question-answering": IBertForQuestionAnswering, - "text-classification": IBertForSequenceClassification, - "token-classification": IBertForTokenClassification, - "zero-shot": IBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = IBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=IBertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip - def test_tie_model_weights(): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - # I-BERT only supports absolute embedding - for type in ["absolute"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "kssteven/ibert-roberta-base" - model = IBertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is IBertEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = IBertEmbeddings(config=config) - - input_ids = ops.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = ops.as_tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is IBertEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = IBertEmbeddings(config=config) - - inputs_embeds = ops.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = ops.as_tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - # Override - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), QuantEmbedding) - model.set_input_embeddings(nn.Embedding(10, 10)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - # Override - def test_feed_forward_chunking(self): - pass # I-BERT does not support chunking - - # Override - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - embed, embed_scaling_factor = wte(input_ids) - inputs["inputs_embeds"] = embed - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - @unittest.skip(reason="ibert overrides scaling to None if inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - - @unittest.skip(reason="ibert training has error.") - def test_training(self): - pass - - -@require_mindspore -class IBertModelIntegrationTest(unittest.TestCase): - def test_quant_embedding(self): - weight_bit = 8 - embedding = QuantEmbedding(2, 4, quant_mode=True, weight_bit=weight_bit) - embedding_weight = mindspore.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) - embedding.weight = nn.Parameter(embedding_weight) - - expected_scaling_factor = (embedding_weight.abs().max() / (2 ** (weight_bit - 1) - 1)) - x, x_scaling_factor = embedding(mindspore.tensor(0)) - y, y_scaling_factor = embedding(mindspore.tensor(1)) - - # scaling factor should follow the symmetric quantization rule - self.assertTrue(ops.allclose(x_scaling_factor, expected_scaling_factor, atol=1e-4)) - self.assertTrue(ops.allclose(x_scaling_factor, expected_scaling_factor, atol=1e-4)) - self.assertTrue(ops.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) - - # quantization error should not exceed the scaling factor - self.assertTrue(ops.allclose(x, embedding_weight[0], atol=expected_scaling_factor.item())) - self.assertTrue(ops.allclose(y, embedding_weight[1], atol=expected_scaling_factor.item())) - - def test_quant_act(self): - def _test_range(): - act = 
QuantAct(activation_bit, act_range_momentum, quant_mode=True) - - # First pass - x = mindspore.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) - x_scaling_factor = mindspore.tensor(1.0) - y, y_scaling_factor = act(x, x_scaling_factor) - y_int = y / y_scaling_factor - - # After the first pass, x_min and x_max should be initialized with x.min() and x.max() - expected_x_min, expected_x_max = x.min(), x.max() - self.assertTrue(ops.allclose(act.x_min, expected_x_min, atol=1e-4)) - self.assertTrue(ops.allclose(act.x_max, expected_x_max, atol=1e-4)) - - # scaling factor should follow the symmetric quantization rule - expected_range = ops.maximum(expected_x_min.abs(), expected_x_max.abs()) - expected_scaling_factor = expected_range / (2 ** (activation_bit - 1) - 1) - self.assertTrue(ops.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) - - # quantization error should not exceed the scaling factor - self.assertTrue(ops.allclose(x, y, atol=expected_scaling_factor.item())) - - # output should be integer - self.assertTrue(ops.allclose(y_int, y_int.round(), atol=1e-4)) - - # Second Pass - x = mindspore.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) * 2 - x_scaling_factor = mindspore.tensor(1.0) - y, y_scaling_factor = act(x, x_scaling_factor) - y_int = y / y_scaling_factor - - # From the second pass, x_min and x_max should be updated with moving average - expected_x_min = expected_x_min * act_range_momentum + x.min() * (1 - act_range_momentum) - expected_x_max = expected_x_max * act_range_momentum + x.max() * (1 - act_range_momentum) - self.assertTrue(ops.allclose(act.x_min, expected_x_min, atol=1e-4)) - self.assertTrue(ops.allclose(act.x_max, expected_x_max, atol=1e-4)) - - # scaling factor should follow the symmetric quantization rule - expected_range = ops.maximum(expected_x_min.abs(), expected_x_max.abs()) - expected_scaling_factor = expected_range / (2 ** (activation_bit - 1) - 1) - self.assertTrue(ops.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) - - # quantization error should not exceed the scaling factor - x = x.clamp(min=-expected_range, max=expected_range) - self.assertTrue(ops.allclose(x, y, atol=expected_scaling_factor.item())) - - # output should be integer - self.assertTrue(ops.allclose(y_int, y_int.round(), atol=1e-4)) - - # Third pass, with eval() - act.eval() - x = mindspore.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) * 3 - - # In eval mode, min/max and scaling factor must be fixed - self.assertTrue(ops.allclose(act.x_min, expected_x_min, atol=1e-4)) - self.assertTrue(ops.allclose(act.x_max, expected_x_max, atol=1e-4)) - self.assertTrue(ops.allclose(y_scaling_factor, expected_scaling_factor, atol=1e-4)) - - def _test_identity(): - # test if identity and identity_scaling_factor are given - # should add the input values - act = QuantAct(activation_bit, act_range_momentum, quant_mode=True) - x = mindspore.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]]) - y = mindspore.tensor([[6.0, -7.0, 1.0, -2.0], [3.0, -4.0, -8.0, 5.0]]) - x_scaling_factor = mindspore.tensor(1.0) - y_scaling_factor = mindspore.tensor(0.5) - z, z_scaling_factor = act(x, x_scaling_factor, y, y_scaling_factor) - z_int = z / z_scaling_factor - self.assertTrue(ops.allclose(x + y, z, atol=0.1)) - self.assertTrue(ops.allclose(z_int, z_int.round(), atol=1e-4)) - - activation_bit = 8 - act_range_momentum = 0.95 - _test_range() - _test_identity() - - def test_quant_linear(self): - def _test(per_channel): - linear_q = QuantLinear(2, 4, quant_mode=True, 
per_channel=per_channel, weight_bit=weight_bit) - linear_dq = QuantLinear(2, 4, quant_mode=False, per_channel=per_channel, weight_bit=weight_bit) - linear_weight = mindspore.tensor([[-1.0, 2.0, 3.0, -4.0], [5.0, -6.0, -7.0, 8.0]]).T - linear_q.weight = nn.Parameter(linear_weight) - linear_dq.weight = nn.Parameter(linear_weight) - - q, q_scaling_factor = linear_q(x, x_scaling_factor) - q_int = q / q_scaling_factor - dq, dq_scaling_factor = linear_dq(x, x_scaling_factor) - - if per_channel: - q_max = ops.max(linear_weight.abs(), dim=1)[0] - else: - q_max = linear_weight.abs().max() - expected_scaling_factor = q_max / (2 ** (weight_bit - 1) - 1) - - # scaling factor should follow the symmetric quantization rule - self.assertTrue(ops.allclose(linear_q.fc_scaling_factor, expected_scaling_factor, atol=1e-4)) - - # output of the normal linear layer and the quantized linear layer should be similar - self.assertTrue(ops.allclose(q, dq, atol=0.5)) - - # output of the quantized linear layer should be integer - self.assertTrue(ops.allclose(q_int, q_int.round(), atol=1e-4)) - - weight_bit = 8 - x = mindspore.tensor([[2.0, -5.0], [-3.0, 4.0]]) - x_scaling_factor = mindspore.tensor([1.0]) - _test(True) - _test(False) - - def test_int_gelu(self): - gelu_q = IntGELU(quant_mode=True) - gelu_dq = nn.GELU() - - x_int = ops.arange(-10000, 10001, 1) - x_scaling_factor = mindspore.tensor(0.001) - x = x_int * x_scaling_factor - - q, q_scaling_factor = gelu_q(x, x_scaling_factor) - q_int = q / q_scaling_factor - dq = gelu_dq(x) - - # output of the normal GELU and the quantized GELU should be similar - self.assertTrue(ops.allclose(q, dq, atol=0.5)) - - # output of the quantized GELU layer should be integer - self.assertTrue(ops.allclose(q_int, q_int.round(), atol=1e-4)) - - def test_force_dequant_gelu(self): - x_int = ops.arange(-10000, 10001, 1) - x_scaling_factor = mindspore.tensor(0.001) - x = x_int * x_scaling_factor - - gelu_dq = IntGELU(quant_mode=False) - gelu_fdqs_dict = { - True: [ - IntGELU(quant_mode=True, force_dequant="nonlinear"), - IntGELU(quant_mode=True, force_dequant="gelu"), - ], - False: [ - IntGELU(quant_mode=True, force_dequant="none"), - IntGELU(quant_mode=True, force_dequant="softmax"), - IntGELU(quant_mode=True, force_dequant="layernorm"), - ], - } - - dq, dq_scaling_factor = gelu_dq(x, x_scaling_factor) - for label, gelu_fdqs in gelu_fdqs_dict.items(): - for gelu_fdq in gelu_fdqs: - q, q_scaling_factor = gelu_fdq(x, x_scaling_factor) - if label: - self.assertTrue(ops.allclose(q, dq, atol=1e-4)) - else: - self.assertFalse(ops.allclose(q, dq, atol=1e-4)) - - def test_int_softmax(self): - output_bit = 8 - softmax_q = IntSoftmax(output_bit, quant_mode=True) - softmax_dq = nn.Softmax() - - def _test(array): - x_int = mindspore.tensor(array) - x_scaling_factor = mindspore.tensor(0.1) - x = x_int * x_scaling_factor - - q, q_scaling_factor = softmax_q(x, x_scaling_factor) - q_int = q / q_scaling_factor - dq = softmax_dq(x) - - # output of the normal Softmax and the quantized Softmax should be similar - self.assertTrue(ops.allclose(q, dq, atol=0.5)) - - # output of the quantized GELU layer should be integer - self.assertTrue(ops.allclose(q_int, q_int.round(), atol=1e-4)) - - # Output of the quantize Softmax should not exceed the output_bit - self.assertTrue(q.abs().max() < 2**output_bit) - - array = [[i + j for j in range(10)] for i in range(-10, 10)] - _test(array) - array = [[i + j for j in range(50)] for i in range(-10, 10)] - _test(array) - array = [[i + 100 * j for j in range(2)] for i in 
range(-10, 10)] - _test(array) - - def test_force_dequant_softmax(self): - output_bit = 8 - array = [[i + j for j in range(10)] for i in range(-10, 10)] - x_int = mindspore.tensor(array) - x_scaling_factor = mindspore.tensor(0.1) - x = x_int * x_scaling_factor - - softmax_dq = IntSoftmax(output_bit, quant_mode=False) - softmax_fdqs_dict = { - True: [ - IntSoftmax(output_bit, quant_mode=True, force_dequant="nonlinear"), - IntSoftmax(output_bit, quant_mode=True, force_dequant="softmax"), - ], - False: [ - IntSoftmax(output_bit, quant_mode=True, force_dequant="none"), - IntSoftmax(output_bit, quant_mode=True, force_dequant="gelu"), - IntSoftmax(output_bit, quant_mode=True, force_dequant="layernorm"), - ], - } - - dq, dq_scaling_factor = softmax_dq(x, x_scaling_factor) - for label, softmax_fdqs in softmax_fdqs_dict.items(): - for softmax_fdq in softmax_fdqs: - q, q_scaling_factor = softmax_fdq(x, x_scaling_factor) - if label: - self.assertTrue(ops.allclose(q, dq, atol=1e-4)) - else: - self.assertFalse(ops.allclose(q, dq, atol=1e-4)) - - def test_int_layernorm(self): - output_bit = 8 - - # some random matrix - array = [[[i * j * j + j for j in range(5, 15)]] for i in range(-10, 10)] - x_int = mindspore.tensor(array) - x_scaling_factor = mindspore.tensor(0.1) - x = x_int * x_scaling_factor - - ln_q = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit) - ln_dq = nn.LayerNorm(x.shape[1:], 1e-5) - - ln_q.weight = nn.Parameter(ops.ones(x.shape[1:])) - ln_q.bias = nn.Parameter(ops.ones(x.shape[1:])) - ln_dq.weight = nn.Parameter(ops.ones(x.shape[1:])) - ln_dq.bias = nn.Parameter(ops.ones(x.shape[1:])) - - q, q_scaling_factor = ln_q(x, x_scaling_factor) - q_int = q / q_scaling_factor - dq = ln_dq(x) - - # output of the normal LN and the quantized LN should be similar - self.assertTrue(ops.allclose(q, dq, atol=0.5)) - - # output of the quantized GELU layer should be integer - self.assertTrue(ops.allclose(q_int, q_int.round(), atol=1e-4)) - - def test_force_dequant_layernorm(self): - output_bit = 8 - array = [[[i * j * j + j for j in range(5, 15)]] for i in range(-10, 10)] - x_int = mindspore.tensor(array) - x_scaling_factor = mindspore.tensor(0.1) - x = x_int * x_scaling_factor - - ln_dq = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=False, output_bit=output_bit) - ln_fdqs_dict = { - True: [ - IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="nonlinear"), - IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="layernorm"), - ], - False: [ - IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="none"), - IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="gelu"), - IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit, force_dequant="softmax"), - ], - } - - ln_dq.weight = nn.Parameter(ops.ones(x.shape[1:])) - ln_dq.bias = nn.Parameter(ops.ones(x.shape[1:])) - dq, dq_scaling_factor = ln_dq(x, x_scaling_factor) - for label, ln_fdqs in ln_fdqs_dict.items(): - for ln_fdq in ln_fdqs: - ln_fdq.weight = nn.Parameter(ops.ones(x.shape[1:])) - ln_fdq.bias = nn.Parameter(ops.ones(x.shape[1:])) - q, q_scaling_factor = ln_fdq(x, x_scaling_factor) - if label: - self.assertTrue(ops.allclose(q, dq, atol=1e-4)) - else: - self.assertFalse(ops.allclose(q, dq, atol=1e-4)) - - def quantize(self, model): - # Helper function that quantizes the given model - # Recursively convert all the `quant_mode` attributes as `True` - if hasattr(model, "quant_mode"): - 
model.quant_mode = True - elif isinstance(model, nn.Sequential): - for n, m in model.named_children(): - self.quantize(m) - elif isinstance(model, nn.ModuleList): - for n in model: - self.quantize(n) - else: - for attr in dir(model): - mod = getattr(model, attr) - if isinstance(mod, nn.Module) and mod != model: - self.quantize(mod) - - @slow - def test_inference_masked_lm(self): - # I-BERT should be "equivalent" to RoBERTa if not quantized - # Test coped from `test_modeling_roberta.py` - model = IBertForMaskedLM.from_pretrained("kssteven/ibert-roberta-base") - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = (1, 11, 50265) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) - - # I-BERT should be "similar" to RoBERTa if quantized - self.quantize(model) - output = model(input_ids)[0] - self.assertEqual(output.shape, expected_shape) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=0.1)) - - @slow - def test_inference_classification_head(self): - # I-BERT should be "equivalent" to RoBERTa if not quantized - # Test coped from `test_modeling_roberta.py` - model = IBertForSequenceClassification.from_pretrained("kssteven/ibert-roberta-large-mnli") - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = (1, 3) - self.assertEqual(output.shape, expected_shape) - expected_tensor = mindspore.tensor([[-0.9469, 0.3913, 0.5118]]) - self.assertTrue(ops.allclose(output, expected_tensor, atol=1e-4)) - - # I-BERT should be "similar" to RoBERTa if quantized - self.quantize(model) - output = model(input_ids)[0] - self.assertEqual(output.shape, expected_shape) - self.assertTrue(ops.allclose(output, expected_tensor, atol=0.1)) \ No newline at end of file diff --git a/tests/transformers/models/idefics/__init__.py b/tests/transformers/models/idefics/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/idefics/test_image_processing_idefics.py b/tests/transformers/models/idefics/test_image_processing_idefics.py deleted file mode 100644 index 550e42bd8..000000000 --- a/tests/transformers/models/idefics/test_image_processing_idefics.py +++ /dev/null @@ -1,206 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
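# --- Editor's note (illustrative sketch, not part of the original test file) ---
# `get_expected_values` below reproduces a shortest-edge resize whose longest edge is
# capped at int(1333 / 800 * size); the size_divisor flooring in the original helper is
# omitted here. A simplified, framework-agnostic version of that geometry:
def shortest_edge_resize(h, w, size, max_ratio=1333 / 800):
    scale = size / min(h, w)
    newh, neww = (size, scale * w) if h < w else (scale * h, size)
    max_size = int(max_ratio * size)
    if max(newh, neww) > max_size:
        rescale = max_size / max(newh, neww)
        newh, neww = newh * rescale, neww * rescale
    return int(newh + 0.5), int(neww + 0.5)

assert shortest_edge_resize(480, 640, 30) == (30, 40)  # landscape: height hits `size`
assert shortest_edge_resize(640, 480, 30) == (40, 30)  # portrait: width hits `size`
# --- end editor's note ---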
- - -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import IdeficsImageProcessor - - -class IdeficsImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - size=None, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], - ): - size = size if size is not None else {"shortest_edge": 30} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - # self.size = size - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "image_size": self.image_size, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to IdeficsImageProcessor, - assuming do_resize is set to True with a scalar size and size_divisor. - """ - if not batched: - size = self.image_size - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - elif isinstance(image, np.ndarray): - h, w = image.shape[0], image.shape[1] - else: - h, w = image.shape[1], image.shape[2] - scale = size / min(w, h) - if h < w: - newh, neww = size, scale * w - else: - newh, neww = scale * h, size - - max_size = int((1333 / 800) * size) - if max(newh, neww) > max_size: - scale = max_size / max(newh, neww) - newh = newh * scale - neww = neww * scale - - newh, neww = int(newh + 0.5), int(neww + 0.5) - expected_height, expected_width = ( - newh // self.size_divisor * self.size_divisor, - neww // self.size_divisor * self.size_divisor, - ) - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return (self.num_channels, height, width) - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class IdeficsImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = IdeficsImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = IdeficsImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def 
test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "image_size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertNotEqual(image_processor.image_size, 30) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, image_size=42) - self.assertEqual(image_processor.image_size, 42) - - @unittest.skip("no torchvision") - # @require_torchvision - def test_torchvision_numpy_transforms_equivalency(self): - pass - # as we had to reimplement the torchvision transforms using transformers utils we must check - # they both do the same - - # image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - # image_processor = self.image_processing_class(**self.image_processor_dict, return_tensors="ms") - # - # print(image_inputs) - # - # def convert_to_rgb(image): - # # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background - # # for transparent images. The call to `alpha_composite` handles this case - # if image.mode == "RGB": - # return image - # - # image_rgba = image.convert("RGBA") - # background = Image.new("RGBA", image_rgba.size, (255, 255, 255)) - # alpha_composite = Image.alpha_composite(background, image_rgba) - # alpha_composite = alpha_composite.convert("RGB") - # return alpha_composite - # - # image_size = image_processor.image_size - # image_mean = image_processor.image_mean - # image_std = image_processor.image_std - # - # transform = transforms.Compose( - # [ - # convert_to_rgb, - # transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC), - # transforms.ToTensor(), - # transforms.Normalize(mean=image_mean, std=image_std), - # ] - # ) - # - # pixel_values_transform_implied = image_processor(image_inputs, transform=None, return_tensors="ms") - # pixel_values_transform_supplied = image_processor(image_inputs, transform=transform, return_tensors="ms") - # - # torch.testing.assert_close(pixel_values_transform_implied, pixel_values_transform_supplied, rtol=0.0, atol=0.0) - - @unittest.skip(reason="not supported") - def test_call_numpy(self): - pass - - @unittest.skip(reason="not supported") - def test_call_numpy_4_channels(self): - pass - - @unittest.skip(reason="not supported") - def test_call_pil(self): - pass - - @unittest.skip(reason="not supported") - def test_call_pytorch(self): - pass diff --git a/tests/transformers/models/idefics/test_modeling_idefics.py b/tests/transformers/models/idefics/test_modeling_idefics.py deleted file mode 100644 index 6f65efa38..000000000 --- a/tests/transformers/models/idefics/test_modeling_idefics.py +++ /dev/null @@ -1,668 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Idefics model.""" - -import unittest - -from parameterized import parameterized - -from mindnlp.transformers.models.idefics import IdeficsConfig -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import ( - TestCasePlus, - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property -from mindnlp.core import ops -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import IdeficsForVisionText2Text, IdeficsModel, IdeficsProcessor - from mindnlp.transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig, IdeficsVisionConfig - -if is_vision_available(): - from PIL import Image - - -class IdeficsModelTester: - def __init__( - self, - parent, - batch_size=1, - seq_length=7, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - alpha_initializer="ones", - num_labels=3, - scope=None, - modality_type_vocab_size=2, - vision_embed_dim=32, - vision_patch_size=2, - vision_image_size=30, - vision_num_attention_heads=4, - vision_num_hidden_layers=5, - vision_intermediate_size=37, - perceiver_qk_layer_norms_perceiver=False, - perceiver_resampler_depth=2, - perceiver_resampler_head_dim=8, - perceiver_resampler_n_heads=2, - perceiver_resampler_n_latents=16, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.alpha_initializer = alpha_initializer - self.num_labels = num_labels - self.scope = scope - self.modality_type_vocab_size = modality_type_vocab_size - - self.vision_embed_dim = vision_embed_dim - self.vision_patch_size = vision_patch_size - self.vision_image_size = vision_image_size - self.vision_num_attention_heads = vision_num_attention_heads - self.vision_num_hidden_layers = vision_num_hidden_layers - self.vision_intermediate_size = vision_intermediate_size - - self.vision_config = IdeficsVisionConfig( - embed_dim=self.vision_embed_dim, - patch_size=self.vision_patch_size, - 
image_size=self.vision_image_size, - num_attention_heads=self.vision_num_attention_heads, - num_hidden_layers=self.vision_num_hidden_layers, - intermediate_size=self.vision_intermediate_size, - ) - - self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver - self.perceiver_resampler_depth = perceiver_resampler_depth - self.perceiver_resampler_head_dim = perceiver_resampler_head_dim - self.perceiver_resampler_n_heads = perceiver_resampler_n_heads - self.perceiver_resampler_n_latents = perceiver_resampler_n_latents - - self.perceiver_config = IdeficsPerceiverConfig( - qk_layer_norms_perceiver=self.perceiver_qk_layer_norms_perceiver, - resampler_depth=self.perceiver_resampler_depth, - resampler_head_dim=self.perceiver_resampler_head_dim, - resampler_n_heads=self.perceiver_resampler_n_heads, - resampler_n_latents=self.perceiver_resampler_n_latents, - ) - - # we set the expected sequence length (which is used in several tests) - # this is equal to the seq length of the text tokens + number of image patches + 1 for the CLS token - self.expected_seq_len = self.seq_length + (self.image_size // self.patch_size) ** 2 + 1 - - def prepare_config_and_inputs(self, num_images=1, interpolate_pos_encoding=False, image_expansion=0): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - pixel_values = floats_tensor( - [ - self.batch_size, - num_images, - self.num_channels, - self.image_size + image_expansion, - self.image_size + image_expansion, - ] - ) - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, num_images]) - - config = self.get_config() - return (config, input_ids, input_mask, pixel_values, image_attention_mask, interpolate_pos_encoding) - - def prepare_config_and_inputs_gate_tests(self): - # Create a list of configs and inputs, to test 2 things: - # 1. For the same image, the output should be different when image_attention_mask is filled with 0s vs filled with 1s. - # 2. For 2 different images, the output should be the same when image_attention_mask is filled with 0s. 
- - interpolate_pos_encoding = False - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - pixel_values = floats_tensor( - [ - self.batch_size, - 1, - self.num_channels, - self.image_size, - self.image_size, - ] - ) - pixel_values_list = [ - pixel_values.copy(), - pixel_values.copy(), - pixel_values.copy().fill(0.6), - pixel_values.copy().fill(0.3), - ] - attention_mask = None - if self.use_input_mask: - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, 1]) - image_attention_mask_list = [ - image_attention_mask.copy().fill(0), - image_attention_mask.copy().fill(1), - image_attention_mask.copy().fill(0), - image_attention_mask.copy().fill(0), - ] - - config = self.get_config() - inputs_list = [] - for pixel_values, image_attention_mask in zip(pixel_values_list, image_attention_mask_list): - inputs_list.append( - { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "image_attention_mask": image_attention_mask, - "interpolate_pos_encoding": interpolate_pos_encoding, - } - ) - - inputs_w_same_img = inputs_list[:2] - inputs_w_0_img_attn = inputs_list[2:] - return config, inputs_w_same_img, inputs_w_0_img_attn - - def get_config(self): - return IdeficsConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - alpha_initializer=self.alpha_initializer, - num_labels=self.num_labels, - modality_type_vocab_size=self.modality_type_vocab_size, - vision_config=self.vision_config, - ) - - def create_and_check_model( - self, - config, - input_ids, - input_mask, - pixel_values, - image_attention_mask, - interpolate_pos_encoding, - ): - - model = IdeficsModel(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - pixel_values=pixel_values, - image_attention_mask=image_attention_mask, - interpolate_pos_encoding=interpolate_pos_encoding, - ) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, input_ids.shape[1], self.hidden_size) - ) - - def create_and_check_model_gen( - self, - config, - input_ids, - input_mask, - pixel_values, - image_attention_mask, - interpolate_pos_encoding, - ): - model = IdeficsForVisionText2Text(config) - model.eval() - model.generate( - input_ids, - attention_mask=input_mask, - pixel_values=pixel_values, - image_attention_mask=image_attention_mask, - interpolate_pos_encoding=interpolate_pos_encoding, - max_length=self.seq_length + 2, - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - pixel_values, - image_attention_mask, - interpolate_pos_encoding, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": input_mask, - "pixel_values": pixel_values, - "image_attention_mask": image_attention_mask, - "interpolate_pos_encoding": interpolate_pos_encoding, - } - return config, 
inputs_dict - - def prepare_pixel_values(self): - return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - @unittest.skip("no SDPA") - # @require_torch_sdpa - @slow - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") - - -@require_mindspore -class IdeficsModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_mindspore_available() else () - # pipeline_model_mapping = {"feature-extraction": IdeficsModel} if is_mindspore_available() else {} - test_pruning = False - test_headmasking = False - test_torchscript = False - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - # XXX: IdeficsForVisionText2TextTest has no MODEL_FOR group yet, but it should be the same - # as MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, so for now manually changing to do the right thing - # as super won't do it - if return_labels: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - - return inputs_dict - - def test_model_outputs_equivalence(self): - try: - orig = self.all_model_classes - # IdeficsModel.forward doesn't have labels input arg - only IdeficsForVisionText2Text does - self.all_model_classes = (IdeficsForVisionText2Text,) if is_mindspore_available() else () - super().test_model_outputs_equivalence() - finally: - self.all_model_classes = orig - - def setUp(self): - self.model_tester = IdeficsModelTester(self) - self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model_single_image(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=1, interpolate_pos_encoding=False, image_expansion=0 - ) - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_multiple_images(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=2, interpolate_pos_encoding=False, image_expansion=0 - ) - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_image_pos_embeddings_interpolation_single_image(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=1, interpolate_pos_encoding=True, image_expansion=2 - ) - self.model_tester.create_and_check_model(*config_and_inputs) - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=1, interpolate_pos_encoding=True, image_expansion=0 - ) - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_image_pos_embeddings_interpolation_multiple_images(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=2, interpolate_pos_encoding=True, image_expansion=2 - ) - self.model_tester.create_and_check_model(*config_and_inputs) - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=2, interpolate_pos_encoding=True, image_expansion=0 - ) - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_generate_with_image_pos_embeddings_interpolation_single_image(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=1, 
interpolate_pos_encoding=True, image_expansion=2 - ) - self.model_tester.create_and_check_model_gen(*config_and_inputs) - - def test_generate_with_image_pos_embeddings_interpolation_multiple_images(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - num_images=2, interpolate_pos_encoding=True, image_expansion=2 - ) - self.model_tester.create_and_check_model_gen(*config_and_inputs) - - def test_cross_attention_gates(self): - config, inputs_w_same_img, inputs_w_0_img_attn = self.model_tester.prepare_config_and_inputs_gate_tests() - - model = IdeficsModel(config=config) - model.eval() - test_1_results = [] - for inputs in inputs_w_same_img: - with mindspore._no_grad(): - last_hidden_states = model(**inputs).last_hidden_state - last_hidden_states = model(**inputs).last_hidden_state - test_1_results.append(last_hidden_states) - self.assertNotEqual(test_1_results[0].sum().item(), test_1_results[1].sum().item()) - - test_2_results = [] - for inputs in inputs_w_0_img_attn: - with mindspore._no_grad(): - last_hidden_states = model(**inputs).last_hidden_state - test_2_results.append(last_hidden_states) - self.assertEqual(test_2_results[0].sum().item(), test_2_results[1].sum().item()) - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes: - # IdeficsModel does not support training, users should use - # IdeficsForVisionText2Text for this purpose - if model_class == IdeficsModel: - self.skipTest(reason="IdeficsModel does not support training") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - model = model_class(config) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - def test_training_gradient_checkpointing(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes: - # IdeficsModel does not support training, users should use - # IdeficsForVisionText2Text for this purpose - if model_class == IdeficsModel: - self.skipTest(reason="IdeficsModel does not support training") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - - model = model_class(config) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") - def test_retain_grad_hidden_states_attentions(self): - return - - def test_attention_outputs(self): - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - 
inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - # IDEFICS does not support outputting attention score becuase it uses SDPA under the hood - self.assertTrue(attentions[0] is None) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - # IDEFICS does not support outputting attention score becuase it uses SDPA under the hood - self.assertTrue(self_attentions[0] is None) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # @is_pt_tf_cross_test - # def test_pt_tf_model_equivalence(self, allow_missing_keys=False): - # self.has_attentions = False - # super().test_pt_tf_model_equivalence(allow_missing_keys=allow_missing_keys) - - @slow - def test_model_from_pretrained(self): - model_name = "HuggingFaceM4/idefics-9b" - # model_name = "HuggingFaceM4/tiny-random-idefics" - - model = IdeficsModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip("no SDPA") - # @require_torch_sdpa - @slow - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") - - -@require_mindspore -class IdeficsForVisionText2TextTest(IdeficsModelTest, unittest.TestCase): - all_model_classes = (IdeficsForVisionText2Text,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = 
IdeficsModelTester( - self, - modality_type_vocab_size=3, - ) - self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) - - @unittest.skip(reason="We only test the model that takes in multiple images") - def test_model(self): - pass - - @unittest.skip(reason="We only test the model that takes in multiple images") - def test_for_token_classification(self): - pass - - @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -@require_mindspore -@require_vision -class IdeficsModelIntegrationTest(TestCasePlus): - @cached_property - def default_processor(self): - return ( - IdeficsProcessor.from_pretrained("HuggingFaceM4/idefics-9b", revision="refs/pr/11") - # IdeficsProcessor.from_pretrained("HuggingFaceM4/tiny-random-idefics", revision="refs/pr/11") - if is_vision_available() - else None - ) - - @unittest.skip("no bitsandbytes") - # @require_bitsandbytes - @slow - def test_inference_natural_language_visual_reasoning(self): - pass - # cat_image_path = self.tests_dir / "fixtures/tests_samples/COCO/000000039769.png" - # cats_image_obj = Image.open(cat_image_path) # 2 cats - # dogs_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" - # - # prompts = [ - # [ - # "User:", - # dogs_image_url, - # "Describe this image.\nAssistant: An image of two dogs.\n", - # "User:", - # cats_image_obj, - # "Describe this image.\nAssistant:", - # ], - # [ - # "User:", - # cats_image_obj, - # "Describe this image.\nAssistant: An image of two kittens.\n", - # "User:", - # dogs_image_url, - # "Describe this image.\nAssistant:", - # ], - # ] - # - # # the CI gpu is small so using quantization to fit - # quantization_config = BitsAndBytesConfig( - # load_in_4bit=True, - # bnb_4bit_compute_dtype="float16", - # ) - # model = IdeficsForVisionText2Text.from_pretrained( - # # "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" - # "HuggingFaceM4/tiny-random-idefics",quantization_config=quantization_config, device_map="auto" - # ) - # processor = self.default_processor - # inputs = processor(prompts, return_tensors="ms", padding="longest").to(torch_device) - # generated_ids = model.generate(**inputs, max_length=100) - # generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) - # - # # keep for debugging - # for i, t in enumerate(generated_text): - # t = bytes(t, "utf-8").decode("unicode_escape") - # print(f"{i}:\n{t}\n") - # - # self.assertIn("image of two cats", generated_text[0]) - # self.assertIn("image of two dogs", generated_text[1]) diff --git a/tests/transformers/models/idefics/test_processor_idefics.py b/tests/transformers/models/idefics/test_processor_idefics.py deleted file mode 100644 index 53c0fa65a..000000000 --- a/tests/transformers/models/idefics/test_processor_idefics.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - -from mindnlp.utils.testing_utils import TestCasePlus, require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ( - AutoProcessor, - IdeficsImageProcessor, - IdeficsProcessor, - LlamaTokenizerFast, - PreTrainedTokenizerFast, - ) - - -@require_mindspore -@require_vision -class IdeficsProcessorTest(TestCasePlus): - def setUp(self): - super().setUp() - - self.checkpoint_path = self.get_auto_remove_tmp_dir() - - image_processor = IdeficsImageProcessor(return_tensors="ms") - tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") - - processor = IdeficsProcessor(image_processor, tokenizer) - - processor.save_pretrained(self.checkpoint_path) - - self.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).image_processor - - def prepare_prompts(self): - """This function prepares a list of PIL images""" - - num_images = 2 - images = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8) for x in range(num_images)] - images = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in images] - - # print([type(x) for x in images]) - # die - - prompts = [ - # text and 1 image - [ - "User:", - images[0], - "Describe this image.\nAssistant:", - ], - # text and images - [ - "User:", - images[0], - "Describe this image.\nAssistant: An image of two dogs.\n", - "User:", - images[1], - "Describe this image.\nAssistant:", - ], - # only text - [ - "User:", - "Describe this image.\nAssistant: An image of two kittens.\n", - "User:", - "Describe this image.\nAssistant:", - ], - # only images - [ - images[0], - images[1], - ], - ] - - return prompts - - def test_save_load_pretrained_additional_features(self): - processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.checkpoint_path) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = IdeficsProcessor.from_pretrained( - self.checkpoint_path, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, IdeficsImageProcessor) - - def test_processor(self): - image_processor = 
self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) - - prompts = self.prepare_prompts() - - # test that all prompts succeeded - input_processor = processor(prompts, return_tensors="ms", padding="longest") - for key in self.input_keys: - # assert ops.is_tensor(input_processor[key]) - assert mindspore.ops.is_tensor(input_processor[key]) - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="ms") - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_tokenizer_padding(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer(padding_side="right") - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="ms") - - predicted_tokens = [ - " Describe this image.\nAssistant:", - " Describe this image.\nAssistant:", - ] - predicted_attention_masks = [ - ([1] * 10) + ([0] * 9), - ([1] * 10) + ([0] * 10), - ] - prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20, return_tensors="ms") - longest = processor(prompts, padding="longest", truncation=True, max_length=30, return_tensors="ms") - - decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) - decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) - - self.assertEqual(decoded_max_length, predicted_tokens[1]) - self.assertEqual(decoded_longest, predicted_tokens[0]) - - self.assertListEqual(max_length["attention_mask"][-1].tolist(), predicted_attention_masks[1]) - self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0]) - - def test_tokenizer_left_padding(self): - """Identical to test_tokenizer_padding, but with padding_side not explicitly set.""" - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_tokens = [ - " Describe this image.\nAssistant:", - " Describe this image.\nAssistant:", - ] - predicted_attention_masks = [ - ([0] * 9) + ([1] * 10), - ([0] * 10) + ([1] * 10), - ] - prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) - longest = processor(prompts, padding="longest", truncation=True, max_length=30) - - decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) - decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) - - self.assertEqual(decoded_max_length, predicted_tokens[1]) - self.assertEqual(decoded_longest, predicted_tokens[0]) - - self.assertListEqual(max_length["attention_mask"][-1].tolist(), predicted_attention_masks[1]) - self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0]) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) - prompts = self.prepare_prompts() - - inputs = processor(prompts, 
padding="longest", return_tensors="ms") - - # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] - self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) diff --git a/tests/transformers/models/imagegpt/__init__.py b/tests/transformers/models/imagegpt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/imagegpt/test_image_processing_imagegpt.py b/tests/transformers/models/imagegpt/test_image_processing_imagegpt.py deleted file mode 100644 index cef945d37..000000000 --- a/tests/transformers/models/imagegpt/test_image_processing_imagegpt.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================ -""" Testing suite for the ImageGpt Processing. """ - - -import json -import os -import tempfile -import unittest - -import numpy as np -from datasets import load_dataset - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore as ms - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ImageGPTImageProcessor - - -class ImageGPTImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - ): - size = size if size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - - def prepare_image_processor_dict(self): - return { - # here we create 2 clusters for the sake of simplicity - "clusters": np.asarray( - [ - [0.8866443634033203, 0.6618829369544983, 0.3891746401786804], - [-0.6042559146881104, -0.02295008860528469, 0.5423797369003296], - ] - ), - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - } - - def expected_output_image_shape(self, images): - return (self.size["height"] * self.size["width"],) - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class ImageGPTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = ImageGPTImageProcessor - - def setUp(self): - self.image_processor_tester = 
ImageGPTImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class( - **self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "clusters")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - def test_image_processor_to_json_string(self): - image_processor = self.image_processing_class( - **self.image_processor_dict) - obj = json.loads(image_processor.to_json_string()) - for key, value in self.image_processor_dict.items(): - if key == "clusters": - self.assertTrue(np.array_equal(value, obj[key])) - else: - self.assertEqual(obj[key], value) - - def test_image_processor_to_json_file(self): - image_processor_first = self.image_processing_class( - **self.image_processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "image_processor.json") - image_processor_first.to_json_file(json_file_path) - image_processor_second = self.image_processing_class.from_json_file( - json_file_path).to_dict() - - image_processor_first = image_processor_first.to_dict() - for key, value in image_processor_first.items(): - if key == "clusters": - self.assertTrue(np.array_equal( - value, image_processor_second[key])) - else: - self.assertEqual(image_processor_first[key], value) - - def test_image_processor_from_and_save_pretrained(self): - image_processor_first = self.image_processing_class( - **self.image_processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - image_processor_first.save_pretrained(tmpdirname) - image_processor_second = self.image_processing_class.from_pretrained( - tmpdirname).to_dict() - - image_processor_first = image_processor_first.to_dict() - for key, value in image_processor_first.items(): - if key == "clusters": - self.assertTrue(np.array_equal( - value, image_processor_second[key])) - else: - self.assertEqual(image_processor_first[key], value) - - @unittest.skip("ImageGPT requires clusters at initialization") - def test_init_without_params(self): - pass - - # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class( - **self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs( - equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing( - image_inputs[0], return_tensors="ms").input_ids - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape( - encoded_images) - self.assertEqual(tuple(encoded_images.shape), - (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing( - image_inputs, return_tensors="ms").input_ids - self.assertEqual( - 
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, - *expected_output_image_shape) - ) - - # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class( - **self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs( - equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing( - image_inputs[0], return_tensors="ms").input_ids - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape( - encoded_images) - self.assertEqual(tuple(encoded_images.shape), - (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing( - image_inputs, return_tensors="ms").input_ids - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, - *expected_output_image_shape) - ) - - @unittest.skip("ImageGPT assumes clusters for 3 channels") - def test_call_numpy_4_channels(self): - pass - - # Override the test from ImageProcessingTestMixin as ImageGPT model takes input_ids as input - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class( - **self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs( - equal_resolution=False, torchify=True) - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape( - image_inputs) - - for image in image_inputs: - self.assertIsInstance(image, ms.Tensor) - - # Test not batched input - encoded_images = image_processing( - image_inputs[0], return_tensors="ms").input_ids - self.assertEqual(tuple(encoded_images.shape), - (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing( - image_inputs, return_tensors="ms").input_ids - self.assertEqual( - tuple(encoded_images.shape), - (self.image_processor_tester.batch_size, *expected_output_image_shape), - ) - - -def prepare_images(): - # we use revision="refs/pr/1" until the PR is merged - # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 - dataset = load_dataset( - "hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - - image1 = dataset[4]["image"] - image2 = dataset[5]["image"] - - images = [image1, image2] - - return images - - -@require_vision -@require_mindspore -class ImageGPTImageProcessorIntegrationTest(unittest.TestCase): - @slow - def test_image(self): - image_processing = ImageGPTImageProcessor.from_pretrained( - "openai/imagegpt-small") - - images = prepare_images() - - # test non-batched - encoding = image_processing(images[0], return_tensors="ms") - - self.assertIsInstance(encoding.input_ids, ms.Tensor) - self.assertEqual(encoding.input_ids.shape, (1, 1024)) - - expected_slice = [306, 191, 191] - self.assertEqual(encoding.input_ids[0, :3].tolist(), expected_slice) - - # test batched - encoding = image_processing(images, return_tensors="ms") - - self.assertIsInstance(encoding.input_ids, ms.Tensor) - self.assertEqual(encoding.input_ids.shape, (2, 1024)) - - expected_slice = [303, 13, 13] - self.assertEqual(encoding.input_ids[1, -3:].tolist(), expected_slice) diff --git a/tests/transformers/models/imagegpt/test_modeling_imagegpt.py 
b/tests/transformers/models/imagegpt/test_modeling_imagegpt.py deleted file mode 100644 index 47375913e..000000000 --- a/tests/transformers/models/imagegpt/test_modeling_imagegpt.py +++ /dev/null @@ -1,527 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import inspect -import os -import tempfile -import unittest - -from mindnlp.transformers import ImageGPTConfig -from mindnlp.utils.testing_utils import require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available, require_mindspore - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - ImageGPTForCausalImageModeling, - ImageGPTForImageClassification, - ImageGPTModel, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ImageGPTImageProcessor - - -class ImageGPTModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - - def get_large_model_config(self): - return ImageGPTConfig.from_pretrained("imagegpt") - - def prepare_config_and_inputs( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - pixel_values = ids_tensor([self.batch_size, 
self.seq_length], self.vocab_size - 1) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config( - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - pixel_values, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - return ImageGPTConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - n_inner=self.intermediate_size, - activation_function=self.hidden_act, - resid_pdrop=self.hidden_dropout_prob, - attn_pdrop=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 513 - config.max_position_embeddings = 1024 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - pixel_values, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - pixel_values, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_imagegpt_model(self, config, pixel_values, input_mask, head_mask, token_type_ids, *args): - model = ImageGPTModel(config=config) - model.eval() - - result = model(pixel_values, token_type_ids=token_type_ids, head_mask=head_mask) - result = model(pixel_values, token_type_ids=token_type_ids) - result = model(pixel_values) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layer) - - def create_and_check_lm_head_model(self, config, pixel_values, input_mask, head_mask, token_type_ids, *args): - model = ImageGPTForCausalImageModeling(config) - model.eval() - - labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) - result = model(pixel_values, 
token_type_ids=token_type_ids, labels=labels) - self.parent.assertEqual(result.loss.shape, ()) - # ImageGPTForCausalImageModeling doens't have tied input- and output embeddings - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size - 1)) - - def create_and_check_imagegpt_for_image_classification( - self, config, pixel_values, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args - ): - config.num_labels = self.num_labels - model = ImageGPTForImageClassification(config) - model.eval() - result = model(pixel_values, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - pixel_values, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = { - "pixel_values": pixel_values, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - -@require_mindspore -class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (ImageGPTForCausalImageModeling, ImageGPTForImageClassification, ImageGPTModel) if is_mindspore_available() else () - ) - all_generative_model_classes = (ImageGPTForCausalImageModeling,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": ImageGPTModel, "image-classification": ImageGPTForImageClassification} - if is_mindspore_available() - else {} - ) - test_missing_keys = False - input_name = "pixel_values" - - # as ImageGPTForImageClassification isn't included in any auto mapping, we add labels here - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "ImageGPTForImageClassification": - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - - return inputs_dict - - # we overwrite the _check_scores method of GenerationTesterMixin, as ImageGPTForCausalImageModeling doesn't have tied input- and output embeddings - def _check_scores(self, batch_size, scores, length, config): - expected_shape = (batch_size, config.vocab_size - 1) - self.assertIsInstance(scores, tuple) - self.assertEqual(len(scores), length) - self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) - - def setUp(self): - self.model_tester = ImageGPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=ImageGPTConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_imagegpt_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_imagegpt_model(*config_and_inputs) - - def test_imagegpt_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_imagegpt_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_imagegpt_for_image_classification(*config_and_inputs) - - @unittest.skip( - reason="This architecure seem to not compute gradients 
properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "openai/imagegpt-small" - model = ImageGPTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_ids"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_resize_tokens_embeddings(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to False") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["pixel_values"] = inputs_dict["pixel_values"].clamp(max=model_vocab_size - 15 - 1) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
- models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_resize_embeddings_untied(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to False") - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - self.skipTest(reason="tie_word_embeddings is set to False") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["pixel_values"] = inputs_dict["pixel_values"].clamp(max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - pixel_values = inputs["pixel_values"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(pixel_values) - - with no_grad(): - model(**inputs)[0] - - # override because ImageGPT main input name is `pixel_values` - # NOTE: in latest transformers this is deprecated, `input_ids` should be used. 
TODO - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - with no_grad(): - out_ids = model(**inputs)[0] - - pixel_values = inputs["pixel_values"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(pixel_values) - - with no_grad(): - out_embeds = model(**inputs)[0] - - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ImageGPTModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small") if is_vision_available() else None - - @slow - def test_inference_causal_lm_head(self): - model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1024, 512) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[2.3445, 2.6889, 2.7313], [1.0530, 1.2416, 0.5699], [0.2205, 0.7749, 0.3953]] - ) - - self.assertTrue(ops.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/instructblip/__init__.py b/tests/transformers/models/instructblip/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/instructblip/test_modeling_instructblip.py b/tests/transformers/models/instructblip/test_modeling_instructblip.py deleted file mode 100644 index abff42c0c..000000000 --- a/tests/transformers/models/instructblip/test_modeling_instructblip.py +++ /dev/null @@ -1,666 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch InstructBLIP model.""" - -import inspect -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import ( - CONFIG_MAPPING, - InstructBlipConfig, - InstructBlipProcessor, - InstructBlipQFormerConfig, - InstructBlipVisionConfig, -) -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import InstructBlipForConditionalGeneration, InstructBlipVisionModel - - -if is_vision_available(): - from PIL import Image - - -class InstructBlipVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in case of a vision transformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return InstructBlipVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = InstructBlipVisionModel(config=config) - model.eval() - with no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = 
{"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class InstructBlipVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as InstructBLIP's vision encoder does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (InstructBlipVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = InstructBlipVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=InstructBlipVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="InstructBLIP's vision encoder does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="InstructBlipVisionModel is an internal building block, doesn't support standalone training") - def test_training(self): - pass - - @unittest.skip(reason="InstructBlipVisionModel is an internal building block, doesn't support standalone training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="InstructBlipVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="InstructBlipVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/instructblip-flan-t5-xl" - model = InstructBlipVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class InstructBlipQFormerModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - 
dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - qformer_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - qformer_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask, qformer_input_ids, qformer_attention_mask - - def get_config(self): - return InstructBlipQFormerConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - ) - - -# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py -class InstructBlipTextModelDecoderOnlyTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - embed_dim=16, - num_labels=3, - word_embed_proj_dim=16, - type_sequence_label_size=2, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.embed_dim = embed_dim - self.num_labels = num_labels - self.type_sequence_label_size = type_sequence_label_size - self.word_embed_proj_dim 
= word_embed_proj_dim - self.is_encoder_decoder = False - - def prepare_config_and_inputs(self): - config = self.get_config() - - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3) - input_ids[:, -1] = self.eos_token_id # Eos Token - - attention_mask = input_ids.ne(self.pad_token_id) - - return config, input_ids, attention_mask - - def get_config(self): - return CONFIG_MAPPING["opt"]( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - embed_dim=self.embed_dim, - is_encoder_decoder=False, - word_embed_proj_dim=self.word_embed_proj_dim, - ) - - -# this model tester uses a decoder-only language model (OPT) -class InstructBlipForConditionalGenerationDecoderOnlyModelTester: - def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 - ): - if vision_kwargs is None: - vision_kwargs = {} - if qformer_kwargs is None: - qformer_kwargs = {} - if text_kwargs is None: - text_kwargs = {} - - self.parent = parent - self.vision_model_tester = InstructBlipVisionModelTester(parent, **vision_kwargs) - self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) - self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests - self.is_training = is_training - self.num_query_tokens = num_query_tokens - - def prepare_config_and_inputs(self): - _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() - _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - - config = self.get_config() - - return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values - - def get_config(self): - return InstructBlipConfig.from_vision_qformer_text_configs( - vision_config=self.vision_model_tester.get_config(), - qformer_config=self.qformer_model_tester.get_config(), - text_config=self.text_model_tester.get_config(), - num_query_tokens=self.num_query_tokens, - ) - - def create_and_check_for_conditional_generation( - self, config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values - ): - model = InstructBlipForConditionalGeneration(config).eval() - with no_grad(): - result = model( - pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - qformer_input_ids=qformer_input_ids, - qformer_attention_mask=qformer_attention_mask, - ) - - expected_seq_length = self.num_query_tokens + self.text_model_tester.seq_length - self.parent.assertEqual( - result.logits.shape, - (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values = config_and_inputs - inputs_dict = { - 
"pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "qformer_input_ids": qformer_input_ids, - "qformer_attention_mask": qformer_attention_mask, - "labels": input_ids, - } - return config, inputs_dict - - -@require_mindspore -class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (InstructBlipForConditionalGeneration,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) - - def test_for_conditional_generation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="InstructBlipForConditionalGeneration doesn't support inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Tied weights are tested in individual model tests") - def test_tied_weights_keys(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="InstructBlipModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="There's no base InstructBlipModel") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="There's no base InstructBlipModel") - def test_save_load_fast_init_to_base(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save InstructBlipConfig and check if we can load InstructBlipVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = InstructBlipVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save InstructBlipConfig and check if we can load InstructBlipQFormerConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - qformer_config = InstructBlipQFormerConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "Salesforce/instructblip-flan-t5-xl" - model = InstructBlipForConditionalGeneration.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" - image = Image.open(requests.get(url, stream=True).raw) - 
return image - - -@require_vision -@require_mindspore -@slow -class InstructBlipModelIntegrationTest(unittest.TestCase): - # @require_bitsandbytes - # @require_accelerate - # def test_inference_vicuna_7b(self): - # processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b") - # model = InstructBlipForConditionalGeneration.from_pretrained( - # "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True - # ) - - # url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - # image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - # prompt = "What is unusual about this image?" - # inputs = processor(images=image, text=prompt, return_tensors="ms").to(mindspore.float16) - - # # verify logits - # with no_grad(): - # logits = model(**inputs).logits - - # expected_slice = mindspore.tensor( - # [[-3.3926, -12.2969, 8.4922], [-5.0195, -11.9531, 8.1406], [-4.0039, -13.3594, 9.2578]], - # ) - - # self.assertTrue(ops.allclose(logits[0, :3, :3].float(), expected_slice, atol=1e-3)) - - # # verify generation - # outputs = model.generate(**inputs, max_new_tokens=30) - # generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() - - # expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off - - # self.assertEqual(outputs[0].tolist(), expected_outputs) - # self.assertEqual( - # generated_text, - # "The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while it is driving down a busy city", - # ) - - def test_inference_flant5_xl(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") - model = InstructBlipForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-flan-t5-xl", - ms_dtype=mindspore.bfloat16, - low_cpu_mem_usage=True, - ) - - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - prompt = "What is unusual about this image?" 
- inputs = processor(images=image, text=prompt, return_tensors="ms") - - for k, v in inputs.items(): - if ops.is_floating_point(v): - inputs[k] = v.to(mindspore.bfloat16) - - outputs = model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0] - - expected_outputs = [0, 37, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 3, 9, 2756, 4459, 6177, 6, 11, 3, 88, 19, 338, 46, 3575, 53, 1476, 12, 743, 112, 2491, 5, 37, 1023, 19, 7225, 788, 12, 8, 685, 24, 34, 1267, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 94, 19, 487, 24, 8, 388, 19, 1119, 12, 1097, 540, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 6, 68, 34, 19, 92, 487, 24, 3, 88, 19, 1119, 12, 1097, 97, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 3, 13865, 13, 8, 1053, 21, 8, 388, 31, 7, 2874, 6, 34, 19, 964, 24, 3, 88, 19, 1119, 12, 1097, 97, 57, 692, 112, 10428, 30, 8, 223, 13, 8, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 1] # fmt: skip - - expected_outputs = [0, 37, 7225, 1023, 9850, 7, 3, 9, 388, 3575, 53, 4954, 30, 8, 223, 13, 3, 9, 4459, 4049, 16, 8, 2214, 13, 3, 9, 3164, 690, 2815, 5, 37, 388, 19, 5119, 3, 9, 4459, 8677, 28, 46, 3575, 53, 1476, 5223, 12, 34, 6, 15495, 24, 3, 88, 19, 692, 112, 293, 10428, 44, 234, 1066, 145, 338, 3, 9, 50, 1106, 3522, 144, 42, 2192, 7919, 31, 7, 5, 37, 1023, 92, 1267, 3, 9, 381, 13, 119, 3203, 16, 8, 2458, 6, 379, 14264, 6, 9256, 7, 6, 11, 11718, 7, 5, 1] # fmt: skip - - self.assertEqual(outputs[0].tolist(), expected_outputs) - self.assertEqual( - generated_text, - "The unusual image depicts a man ironing clothes on the back of a yellow van in the middle of a busy city street. The man is wearing a yellow shirt with an ironing board attached to it, suggesting that he is doing his own laundry at home rather than using a laundromat or dry cleaner's. The image also shows a number of other vehicles in the background, including buses, taxis, and motorcycles.", - ) - - def test_inference_interpolate_pos_encoding(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") - model = InstructBlipForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-flan-t5-xl", - ms_dtype=mindspore.bfloat16, - low_cpu_mem_usage=True, - ) - processor.image_processor.size = {"height": 500, "width": 500} - - image = prepare_img() - prompt = "What's in the image?" - inputs = processor(images=image, text=prompt, return_tensors="ms") - - predictions = model.generate(**inputs, interpolate_pos_encoding=True) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - self.assertEqual( - predictions[0].tolist(), [0, 37, 1023, 753, 3, 9, 2335, 3823, 30, 8, 2608, 28, 3, 9, 1782, 5, 1] - ) - self.assertEqual(generated_text, "The image features a woman sitting on the beach with a dog.") - - def test_expansion_in_processing(self): - processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-flan-t5-xl") - model = InstructBlipForConditionalGeneration.from_pretrained( - "Salesforce/instructblip-flan-t5-xl", - ms_dtype=mindspore.bfloat16, - low_cpu_mem_usage=True, - ) - - image = prepare_img() - prompt = "What's in the image?" 
- - # Make sure we will go the legacy path by setting these args to None - processor.num_query_tokens = None - model.config.image_token_index = None - inputs = processor(images=image, text=prompt, return_tensors="ms").to(dtype=mindspore.float16) - - predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() - - # Add args to the config to trigger new logic when inputs are expanded in processing file - processor.num_query_tokens = model.config.num_query_tokens - processor.tokenizer.add_special_tokens({"additional_special_tokens": [""]}) - model.config.image_token_index = len(processor.tokenizer) - 1 - model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64) - - # Generate again with new inputs - inputs = processor(images=image, text=prompt, return_tensors="ms").to(dtype=mindspore.float16) - predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15) - generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip() - - self.assertTrue(generated_text_expanded == generated_text) \ No newline at end of file diff --git a/tests/transformers/models/internlm/__init__.py b/tests/transformers/models/internlm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/internlm/test_modeling_internlm.py b/tests/transformers/models/internlm/test_modeling_internlm.py deleted file mode 100644 index f01e7a270..000000000 --- a/tests/transformers/models/internlm/test_modeling_internlm.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Test InternLM""" -import gc -import os -import unittest - -import mindspore -import numpy as np -from mindspore import Tensor -from mindnlp.transformers.models.internlm import InternLMConfig, InternLMModel, InternLMForCausalLM, \ - InternLMForSequenceClassification -from mindnlp.utils.testing_utils import slow -from tests.common import MindNLPTestCase - -class TestModelingInternLM(MindNLPTestCase): - r""" - Test InternLM - """ - - def setUp(self): - """ - Set up.
- """ - self.config_7b = InternLMConfig(vocab_size=1000, num_hidden_layers=2) - - @slow - def test_7b_model(self): - r""" - Test Model - """ - model = InternLMModel(self.config_7b) - input_ids = Tensor(np.random.randint(0, 100, (1, 128)), mindspore.int32) - outputs = model(input_ids=input_ids) - assert outputs[0].shape == (1, 128, 4096) - - - @slow - def test_internlm_for_causal_lm_7b(self): - r""" - Test InternLMForCausalLM - """ - model = InternLMForCausalLM(self.config_7b) - input_ids = Tensor(np.random.randint(0, 100, (1, 128)), mindspore.int32) - outputs = model(input_ids=input_ids) - assert outputs[0].shape == (1, 128, 1000) - - @slow - def test_internlm_for_sequence_classification_7b(self): - r""" - Test InternLMForSequenceClassification - """ - model = InternLMForSequenceClassification(self.config_7b) - input_ids = Tensor(np.random.randint(0, 100, (1, 128)), mindspore.int32) - outputs = model(input_ids=input_ids) - assert outputs[0].shape == (1, 2) diff --git a/tests/transformers/models/jamba/__init__.py b/tests/transformers/models/jamba/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/jamba/test_modeling_jamba.py b/tests/transformers/models/jamba/test_modeling_jamba.py deleted file mode 100644 index 9baf199f5..000000000 --- a/tests/transformers/models/jamba/test_modeling_jamba.py +++ /dev/null @@ -1,537 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Jamba model.""" - -import math -import tempfile -import unittest - -import pytest -from parameterized import parameterized - -from mindspore.common.api import _no_grad -from mindnlp.transformers import AutoTokenizer, JambaConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - JambaForCausalLM, - JambaForSequenceClassification, - JambaModel, - ) - from mindnlp.transformers.models.jamba.modeling_jamba import ( - HybridMambaAttentionDynamicCache, - ) - -class JambaModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - attn_layer_offset=1, - attn_layer_period=8, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.attn_layer_offset = attn_layer_offset - self.attn_layer_period = attn_layer_period - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return JambaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - attn_layer_offset=self.attn_layer_offset, - attn_layer_period=self.attn_layer_period, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - 
intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=True, - initializer_range=self.initializer_range, - use_mamba_kernels=False, - num_experts=2, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - - return ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def create_and_check_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): - model = JambaModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = JambaForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids, labels=token_labels) - result = model(input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.is_decoder = True - config.add_cross_attention = True - model = JambaForCausalLM(config=config) - model.eval() - - # first forward pass - # Attention: Jamba needs the cache to be initialized to return a cache! 
- past_key_values = HybridMambaAttentionDynamicCache( - config, input_ids.shape[0], model.dtype - ) - outputs = model( - input_ids, - attention_mask=input_mask, - past_key_values=past_key_values, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - cache_position=ops.arange( - input_ids.shape[1], input_ids.shape[1] + next_tokens.shape[1] - ), - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = JambaForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - JambaModel, - JambaForCausalLM, - JambaForSequenceClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (JambaForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": JambaModel, - "text-classification": JambaForSequenceClassification, - "text-generation": JambaForCausalLM, - "zero-shot": JambaForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - - def setUp(self): - self.model_tester = JambaModelTester(self) - self.config_tester = ConfigTester(self, config_class=JambaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_casual_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_load_balancing_loss(self): - r""" - Let's make sure we can actually compute the loss and do a backward on it. - """ - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.num_experts = 16 - config.output_router_logits = True - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(config.pad_token_id) - model = JambaForCausalLM(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask) - bs, seqlen = input_ids.shape - self.assertEqual(result.router_logits[0].shape, (bs * seqlen, config.num_experts)) - self.assertTrue(ops.allclose(result.aux_loss, mindspore.tensor(2, dtype=mindspore.float32), rtol=1e-2, atol=1e-2)) - - # First, we make sure that adding padding tokens doesn't change the loss - # loss(input_ids, attention_mask=None) == loss(input_ids + padding, attention_mask=attention_mask_with_padding) - pad_length = 1000 - # Add padding tokens to input_ids - padding_block = config.pad_token_id * ops.ones(input_ids.shape[0], pad_length, dtype=input_ids.dtype) - padded_input_ids = ops.cat((padding_block, input_ids), dim=1) # this is to simulate padding to the left - padded_attention_mask = padded_input_ids.ne(config.pad_token_id) - - padded_result = model(padded_input_ids, attention_mask=padded_attention_mask) - self.assertTrue(ops.allclose(result.aux_loss, padded_result.aux_loss, rtol=1e-4, atol=1e-4)) - - # We make sure that the loss of including padding tokens != the loss without padding tokens - # if attention_mask=None --> we don't exclude padding tokens - include_padding_result = model(padded_input_ids, attention_mask=None) - - # This is to mimic torch.testing.assert_not_close - self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item()) - - def test_initialization(self): - r""" - Overriding the test_initialization test as the A_log and D params of the Mamba block are initialized differently - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if "A_log" in name: - A = ops.arange(1, config.mamba_d_state + 1, dtype=mindspore.float32)[None, :] - self.assertTrue(ops.allclose(param.data, ops.log(A), atol=1e-5, rtol=1e-5)) - elif "D" in name: - # check if it's a ones like - self.assertTrue(ops.allclose(param.data, ops.ones_like(param.data), atol=1e-5, rtol=1e-5)) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_mismatched_shapes_have_properly_initialized_weights(self): - r""" - Overriding the test_mismatched_shapes_have_properly_initialized_weights test because A_log and D params of the - Mamba block are initialized differently and we tested that in test_initialization - """ - self.skipTest(reason="Cumbersome and redundant for Jamba") - - def test_attention_outputs(self): - r""" - Overriding the test_attention_outputs test as the Jamba model outputs attention only for its 
attention layers - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - expected_num_attentions = math.ceil( - (self.model_tester.num_hidden_layers - self.model_tester.attn_layer_offset) - / self.model_tester.attn_layer_period - ) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), expected_num_attentions) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), expected_num_attentions) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), expected_num_attentions) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - @unittest.skip(reason="Jamba has its own special cache type") - @parameterized.expand([(1, False), (1, True), (4, False)]) - def test_new_cache_format(self, num_beams, do_sample): - pass - - -@require_mindspore -class JambaModelIntegrationTest(unittest.TestCase): - model = None - tokenizer = None - - @classmethod - def setUpClass(cls): - model_id = "ai21labs/Jamba-tiny-random" - cls.model = JambaForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16, low_cpu_mem_usage=True) - cls.tokenizer = AutoTokenizer.from_pretrained(model_id) - - @slow - def test_simple_generate(self): - self.model - - input_ids = self.tokenizer("Hey how are you doing on this lovely evening?", return_tensors="ms")[ - "input_ids" - ] - out = self.model.generate(input_ids, do_sample=False, max_new_tokens=10) - output_sentence = self.tokenizer.decode(out[0, :]) - self.assertEqual( - output_sentence, - "<|startoftext|>Hey how are you doing on this lovely evening? 
Canyon rins hugaughter glamour Rutgers Singh Hebrew cases Cats", - ) - - with _no_grad(): - logits = self.model(input_ids=input_ids).logits - - EXPECTED_LOGITS_NO_GRAD = mindspore.tensor( - [ - 0.0140, -0.2246, 0.0408, -0.1016, 0.0471, 0.2715, -0.1465, 0.1631, - -0.2949, -0.0297, 0.0250, -0.5586, -0.2139, -0.1426, -0.1602, 0.1309, - 0.0703, 0.2236, 0.1729, -0.2285, -0.1152, -0.1177, -0.1367, 0.0289, - 0.1245, 0.2363, 0.0442, 0.1094, -0.1348, -0.2295, 0.1494, -0.3945, - 0.1777, -0.4570, -0.0408, 0.2412, 0.1562, -0.1943, 0.2373, -0.0593 - ] - , dtype=mindspore.float32) # fmt: skip - - self.assertTrue(ops.allclose(logits[0, -1, :40], EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1e-3)) - - @slow - def test_simple_batched_generate_with_padding(self): - self.model - - inputs = self.tokenizer( - ["Hey how are you doing on this lovely evening?", "Tell me a story"], padding=True, return_tensors="ms" - ) - out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10) - output_sentences = self.tokenizer.batch_decode(out) - self.assertEqual( - output_sentences[0], - "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew cases Cats", - ) - self.assertEqual( - output_sentences[1], - "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed", - ) - - with _no_grad(): - logits = self.model(input_ids=inputs["input_ids"]).logits - - EXPECTED_LOGITS_NO_GRAD_0 = mindspore.tensor( - [ - 0.0140, -0.2246, 0.0408, -0.1016, 0.0471, 0.2715, -0.1465, 0.1631, - -0.2949, -0.0297, 0.0250, -0.5586, -0.2139, -0.1426, -0.1602, 0.1309, - 0.0703, 0.2236, 0.1729, -0.2285, -0.1152, -0.1177, -0.1367, 0.0289, - 0.1245, 0.2363, 0.0442, 0.1094, -0.1348, -0.2295, 0.1494, -0.3945, - 0.1777, -0.4570, -0.0408, 0.2412, 0.1562, -0.1943, 0.2373, -0.0593 - ] - , dtype=mindspore.float32) # fmt: skip - - EXPECTED_LOGITS_NO_GRAD_1 = mindspore.tensor( - [ - -0.1289, 0.2363, -0.4180, -0.0302, -0.0476, 0.0327, 0.2578, 0.0874, - 0.1484, 0.2305, -0.1152, -0.1396, -0.1494, -0.1113, -0.0021, -0.2832, - 0.2002, -0.2676, 0.0598, -0.1982, -0.2539, -0.1133, -0.1973, 0.2148, - 0.0559, 0.1670, 0.1846, 0.1270, 0.1680, -0.1250, -0.2656, -0.2871, - 0.2344, 0.2637, 0.0510, -0.1855, 0.2158, -0.1289, 0.1758, 0.0074 - ] - , dtype=mindspore.float32) # fmt: skip - - self.assertTrue(ops.allclose(logits[0, -1, :40], EXPECTED_LOGITS_NO_GRAD_0, rtol=1e-3, atol=1e-3)) - self.assertTrue(ops.allclose(logits[1, -1, :40], EXPECTED_LOGITS_NO_GRAD_1, rtol=1e-3, atol=1e-3)) \ No newline at end of file diff --git a/tests/transformers/models/jetmoe/__init__.py b/tests/transformers/models/jetmoe/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/jetmoe/test_modeling_jetmoe.py b/tests/transformers/models/jetmoe/test_modeling_jetmoe.py deleted file mode 100644 index bad20d986..000000000 --- a/tests/transformers/models/jetmoe/test_modeling_jetmoe.py +++ /dev/null @@ -1,430 +0,0 @@ -# coding=utf-8 -# Copyright 2024 JetMoe AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch JetMoe model.""" - -import gc -import tempfile -import unittest - -import pytest -from parameterized import parameterized - -from mindnlp.transformers import AutoTokenizer, JetMoeConfig, is_mindspore_available -from mindnlp.utils.testing_utils import ( - is_flaky, - require_mindspore, - require_mindspore_gpu, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - JetMoeForCausalLM, - JetMoeForSequenceClassification, - JetMoeModel, - ) - - -class JetMoeModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_key_value_heads=2, - kv_channels=8, - intermediate_size=37, - hidden_act="silu", - num_local_experts=4, - num_experts_per_tok=2, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.kv_channels = kv_channels - self.num_attention_heads = num_key_value_heads * num_experts_per_tok - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.num_local_experts = num_local_experts - self.num_experts_per_tok = num_experts_per_tok - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.ones(self.batch_size, self.seq_length) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, 
sequence_labels, token_labels, choice_labels - - def get_config(self): - return JetMoeConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_key_value_heads=self.num_key_value_heads, - kv_channels=self.kv_channels, - intermediate_size=self.intermediate_size, - activation_function=self.hidden_act, - num_local_experts=self.num_local_experts, - num_experts_per_tok=self.num_experts_per_tok, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = JetMoeModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = JetMoeModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = JetMoeForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = JetMoeForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (JetMoeModel, JetMoeForCausalLM, JetMoeForSequenceClassification) if is_mindspore_available() else () - ) - all_generative_model_classes = (JetMoeForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": JetMoeModel, - "text-classification": JetMoeForSequenceClassification, - "text-generation": JetMoeForCausalLM, - "zero-shot": JetMoeForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - test_mismatched_shapes = False - test_cpu_offload = False - test_disk_offload_bin = False - test_disk_offload_safetensors = False - - @parameterized.expand([(1, False), (1, True), (4, False)]) - def test_new_cache_format(self, num_beams, do_sample): - pass - - def setUp(self): - self.model_tester = JetMoeModelTester(self) - self.config_tester = ConfigTester( - self, config_class=JetMoeConfig, common_properties=["hidden_size", "num_hidden_layers"] - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_config - def test_config(self): - self.config_tester.run_common_tests() - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_various_embeddings - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model with llama->jetmoe, Llama->JetMoe - def test_jetmoe_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = JetMoeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, 
(self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_single_label with llama->jetmoe, Llama->JetMoe - def test_jetmoe_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = JetMoeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_multi_label with llama->jetmoe, Llama->JetMoe - def test_jetmoe_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = JetMoeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @unittest.skip(reason="JetMoe buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="JetMoe uses MoA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - -@require_mindspore -class JetMoeIntegrationTest(unittest.TestCase): - @slow - def test_model_8b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = JetMoeForCausalLM.from_pretrained("jetmoe/jetmoe-8b", device_map="auto") - input_ids = mindspore.tensor([input_ids]).to(model.model.embed_tokens.weight.device) - with no_grad(): - out = model(input_ids).logits.cpu() - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[0.2507, -2.7073, -1.3445, -1.9363, -1.7216, -1.7370, -1.9054, -1.9792]]) - self.assertEqual(ops.allclose(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([-3.3689, 5.9006, 5.7450, -1.7012, -4.7072, -4.7071, -4.7071, -4.7071, -4.7072, -4.7072, -4.7072, -4.7071, 3.8321, 9.1746, -4.7071, -4.7072, -4.7071, -4.7072, -4.7071, -4.7072, -4.7071, -4.7071, -4.7071, -4.7071, -4.7071, -4.7071, -4.7071, -4.7071, -4.7071, -4.7071]) # fmt: skip - self.assertEqual(ops.allclose(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4)) - - del model - gc.collect() - - @slow - def test_model_8b_generation(self): - EXPECTED_TEXT_COMPLETION = """My favourite condiment is ....\nI love ketchup. 
I love""" - prompt = "My favourite condiment is " - tokenizer = AutoTokenizer.from_pretrained("jetmoe/jetmoe-8b", use_fast=False) - model = JetMoeForCausalLM.from_pretrained("jetmoe/jetmoe-8b", device_map="auto") - input_ids = tokenizer.encode(prompt, return_tensors="ms").to(model.model.embed_tokens.weight.device) - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=10, temperature=0) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - del model - gc.collect() - - @slow - def test_model_8b_batched_generation(self): - EXPECTED_TEXT_COMPLETION = [ - """My favourite condiment is ....\nI love ketchup. I love""", - """My favourite 2018 Christmas present was a new pair""", - ] - prompt = [ - "My favourite condiment is ", - "My favourite ", - ] - tokenizer = AutoTokenizer.from_pretrained("jetmoe/jetmoe-8b", use_fast=False) - model = JetMoeForCausalLM.from_pretrained("jetmoe/jetmoe-8b", device_map="auto") - input_ids = tokenizer(prompt, return_tensors="ms", padding=True).to(model.model.embed_tokens.weight.device) - print(input_ids) - - # greedy generation outputs - generated_ids = model.generate(**input_ids, max_new_tokens=10, temperature=0) - text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - print(text) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - del model - gc.collect() diff --git a/tests/transformers/models/kosmos2/__init__.py b/tests/transformers/models/kosmos2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/kosmos2/test_modeling_kosmos2.py b/tests/transformers/models/kosmos2/test_modeling_kosmos2.py deleted file mode 100644 index b648b6ac4..000000000 --- a/tests/transformers/models/kosmos2/test_modeling_kosmos2.py +++ /dev/null @@ -1,675 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore KOSMOS-2 model.""" - -import copy -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests -from mindspore import ops -from mindnlp.transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config -from mindnlp.transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import Kosmos2ForConditionalGeneration, Kosmos2Model - - -if is_vision_available(): - from PIL import Image - - -class Kosmos2VisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=32, - patch_size=4, - num_channels=3, - is_training=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return Kosmos2VisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -class Kosmos2TextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size 
- self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return Kosmos2TextConfig( - vocab_size=self.vocab_size, - embed_dim=self.hidden_size, - layers=self.num_hidden_layers, - attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -class Kosmos2ModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, latent_query_num=3, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = Kosmos2TextModelTester(parent, **text_kwargs) - self.vision_model_tester = Kosmos2VisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.latent_query_num = latent_query_num - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - - # build `image_embeds_position_mask` - image_embeds_position_mask = ops.zeros_like(input_ids) - image_embeds_position_mask[:, 1 : 1 + self.latent_query_num :] = 1 - - config = self.get_config() - - return config, input_ids, attention_mask, image_embeds_position_mask, pixel_values - - def get_config(self): - return Kosmos2Config( - self.text_model_tester.get_config().to_dict(), - self.vision_model_tester.get_config().to_dict(), - latent_query_num=self.latent_query_num, - ) - - def create_and_check_model(self, config, input_ids, attention_mask, image_embeds_position_mask, pixel_values): - model = Kosmos2Model(config).set_train(False) - result = model(pixel_values, input_ids, image_embeds_position_mask, attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.text_model_tester.batch_size, self.text_model_tester.seq_length, self.text_model_tester.hidden_size), - ) - self.parent.assertEqual( - result.image_embeds.shape, - (self.text_model_tester.batch_size, self.latent_query_num, self.text_model_tester.hidden_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, image_embeds_position_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - 
"image_embeds_position_mask": image_embeds_position_mask, - "pixel_values": pixel_values, - } - return config, inputs_dict - - -@require_mindspore -class Kosmos2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_mindspore_available() else () - # pipeline_model_mapping = ( - # {"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration} - # if is_mindspore_available() - # else {} - # ) - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - # TODO: `image-to-text` pipeline for this model needs Processor. - # def is_pipeline_test_to_skip( - # self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - # ): - # return pipeline_test_casse_name == "ImageToTextPipelineTests" - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if return_labels: - if model_class.__name__ == "Kosmos2ForConditionalGeneration": - inputs_dict["labels"] = ops.zeros( - (self.model_tester.text_model_tester.batch_size, self.model_tester.text_model_tester.seq_length), - dtype=mindspore.int64, - ) - - return inputs_dict - - def setUp(self): - self.model_tester = Kosmos2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37) - - # overwrite from common to skip `image_to_text_projection.latent_query` - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - if name == "image_to_text_projection.latent_query": - # The original code use ` nn.Parameter(torch.randn(...))` for which this test won't pass. 
- continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_load_save_without_tied_weights(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.text_config.tie_word_embeddings = False - for model_class in self.all_model_classes: - model = model_class(config) - with tempfile.TemporaryDirectory() as d: - model.save_pretrained(d) - - model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) - # Checking the state dicts are correct - reloaded_state = model_reloaded.parameters_dict() - for k, v in model.parameters_dict().items(): - self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded") - np.testing.assert_allclose( - #torch.testing.assert_close( - v.asnumpy(), reloaded_state[k].asnumpy(), err_msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}" - ) - # Checking there was no complain of missing weights - self.assertEqual(infos["missing_keys"], []) - - # overwrite from common in order to use `self.model_tester.text_model_tester.num_hidden_layers` - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, - "expected_num_hidden_layers", - self.model_tester.text_model_tester.num_hidden_layers + 1, - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.text_model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.text_model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # overwrite from common in order to use `config.text_config.vocab_size` instead of `config.vocab_size` - def test_tie_model_weights(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.data.ne(p2.data).sum() > 0: - equal = False - return equal - - for model_class in self.all_model_classes: - model_not_tied = model_class(config) - if model_not_tied.get_output_embeddings() is None: - continue - - config_tied = copy.deepcopy(config) - model_tied = model_class(config_tied) - 
params_tied = list(model_tied.get_parameters()) - # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(check_same_values(embeddings, decoding)) - - # # Check that after modification, they remain the same. - # embeddings.weight.data.div_(2) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # # Check that after modification, they remain the same. - # decoding.weight.data.div_(4) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(embeddings.weight.shape, decoding.weight.shape) - # self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. - model_tied.resize_token_embeddings(config.text_config.vocab_size + 10) - params_tied_2 = list(model_tied.get_parameters()) - self.assertEqual(len(params_tied_2), len(params_tied)) - - # decoding.weight.data.mul_(20) - # # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) - # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/kosmos-2-patch14-224" - model = Kosmos2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://hf-mirror.com/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg?download=true" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -@slow -class Kosmos2ModelIntegrationTest(unittest.TestCase): - def run_example(self, prompt, image, model, processor): - inputs = processor(text=prompt, images=image, return_tensors="ms", padding=True) - - generation_outputs = model.generate( - pixel_values=inputs["pixel_values"], - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - image_embeds=None, - image_embeds_position_mask=inputs["image_embeds_position_mask"], - use_cache=True, - max_new_tokens=128, - output_scores=True, - return_dict_in_generate=True, - ) - - scores = generation_outputs.scores - generated_ids = generation_outputs.sequences - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) - # Specify `cleanup_and_extract=False` in order to see the raw model generation. - processed_text = [processor.post_process_generation(x, cleanup_and_extract=False) for x in generated_text] - # By default, the generated text is cleanup and the entities are extracted. 
- final_text_with_entities = [processor.post_process_generation(x) for x in generated_text] - - return scores, generated_ids, generated_text, processed_text, final_text_with_entities - - def test_snowman_image_captioning(self): - url = "https://hf-mirror.com/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png?download=true" - - image = Image.open(requests.get(url, stream=True).raw) - image.save("new_image.jpg") - image = Image.open("new_image.jpg") - - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224") - processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") - - prompt = "An image of" - scores, generated_ids, generated_text, processed_text, final_text_with_entities = self.run_example( - prompt, image, model, processor - ) - processed_text = processed_text[0] - final_text, entities = final_text_with_entities[0] - - #atol = 1e-4 if IS_ROCM_SYSTEM else 1e-5 - atol = 1e-5 - - np.testing.assert_allclose( - ops.concat(scores[1:4])[:3, :3].numpy(), - np.array( - [ - [-1.5672581195831299, -5.007406711578369, 4.36448860168457], - [-2.147017002105713, -4.966302871704102, 4.592559337615967], - [-0.9352350831031799, -4.688288688659668, 6.240612983703613], - ] - ), - atol=atol, - ) - np.testing.assert_allclose( - ops.concat(scores[-3:])[-3:, -3:].numpy(), - np.array( - [ - [2.9916205406188965, 2.481820583343506, 4.646594524383545], - [-2.8381078243255615, -2.9687185287475586, -2.6926779747009277], - [-2.8909168243408203, -3.2228589057922363, -1.7056822776794434], - ] - ), - atol=1e-5, - ) - - # fmt: off - EXPECTED_IDS = [ - [ - 0, 64003, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, - 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 64004, 64012, 712, 1648, 9, 64007, 10, 43867, 64008, - 64009, 64057, 64876, 64010, 5950, 597, 32, 64007, 10, 646, 64008, 64009, 64018, 64924, 64010, 4, 2 - ] - ] - # fmt: on - self.assertListEqual(generated_ids.numpy().tolist(), EXPECTED_IDS) - - EXPECTED_PROCESSED_TEXT = ( - " An image of a snowman " - "warming himself by a fire." 
- ) - self.assertEqual(processed_text, EXPECTED_PROCESSED_TEXT) - - self.assertEqual(final_text, "An image of a snowman warming himself by a fire.") - - EXPECTED_ENTITIES = [ - ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), - ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]), - ] - self.assertListEqual(entities, EXPECTED_ENTITIES) - - # test with the detail caption generation - - prompt = "Describe this image in detail:" - scores, generated_ids, generated_text, processed_text, final_text_with_entities = self.run_example( - prompt, image, model, processor - ) - processed_text = processed_text[0] - final_text, entities = final_text_with_entities[0] - - np.testing.assert_allclose( - ops.concat(scores[1:4])[:3, :3].numpy(), - np.array( - [ - [-0.9093570113182068, -4.578373908996582, 5.96360969543457], - [2.452126979827881, -4.090598106384277, 8.738677024841309], - [-0.7624598741531372, -4.771658897399902, 6.576295852661133], - ] - ), - atol=atol, - ) - np.testing.assert_allclose( - ops.concat(scores[-3:])[-3:, -3:].numpy(), - np.array( - [ - [-1.673659086227417, -2.162452220916748, -1.95430588722229], - [-2.006824493408203, -2.2038745880126953, -1.24686861038208], - [-3.2783470153808594, -2.814181089401245, -1.390632152557373], - ] - ), - atol=1e-5, - ) - - # fmt: off - EXPECTED_IDS_LONG = [ - [ - 0, 64003, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, - 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 64004, 64012, 34645, 247, 38, 1648, 12, 3391, 55, - 24, 1648, 1338, 10, 43867, 1280, 32, 64007, 10, 30879, 64008, 64009, 64018, 65020, 64010, 12, 5, 1842, - 4, 71, 17, 1679, 64007, 10, 3958, 64008, 64009, 64061, 64263, 64010, 6, 64007, 15719, 64008, 64009, - 64253, 64617, 64010, 6, 8, 64007, 9626, 64008, 64009, 64413, 64545, 64010, 6, 23, 64007, 10, 4363, - 64008, 64009, 64623, 64885, 64010, 2255, 8, 64007, 10, 3486, 64008, 64009, 64809, 65036, 64010, 1560, - 2255, 4, 24, 43867, 1684, 7, 27, 3774, 5, 10356, 9, 5, 646, 6, 8, 22, 1684, 7, 30, 10, 2007, 8, 16239, - 4337, 4, 2 - ] - ] - # fmt: on - self.assertListEqual(generated_ids.numpy().tolist(), EXPECTED_IDS_LONG) - - EXPECTED_PROCESSED_TEXT_LONG = ( - " Describe this image in detail: The image features a snowman sitting by a campfire" - " in the snow. He is wearing a hat" - ", scarf" - ", and gloves" - ", with a pot" - " nearby and a cup placed " - "nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy " - "atmosphere." - ) - self.assertEqual(processed_text, EXPECTED_PROCESSED_TEXT_LONG) - - EXPECTED_FINAL_TEXT_LONG = ( - "Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is " - "wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be " - "enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere." 
- ) - self.assertEqual(final_text, EXPECTED_FINAL_TEXT_LONG) - - EXPECTED_ENTITIES_LONG = [ - ("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]), - ("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]), - ("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]), - ("gloves", (127, 133), [(0.515625, 0.390625, 0.640625, 0.515625)]), - ("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]), - ("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]), - ] - self.assertListEqual(entities, EXPECTED_ENTITIES_LONG) - - def test_snowman_image_captioning_batch(self): - url = "https://hf-mirror.com/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png?download=true" - image = Image.open(requests.get(url, stream=True).raw) - image.save("new_image.jpg") - image = Image.open("new_image.jpg") - - model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224") - - prompt = ["Describe this image in detail:", "An image of"] - - # left padding - processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left") - - scores, generated_ids, generated_text, processed_text, final_text_with_entities = self.run_example( - prompt, [image] * len(prompt), model, processor - ) - all_final_text = [x[0] for x in final_text_with_entities] - all_entities = [x[1] for x in final_text_with_entities] - - # left padding gives identical results as non-padding - EXPECTED_PROCESSED_TEXT_0 = ( - " Describe this image in detail: The image features a snowman sitting by a campfire" - " in the snow. He is wearing a hat" - ", scarf" - ", and gloves" - ", with a pot" - " nearby and a cup placed " - "nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy " - "atmosphere." - ) - EXPECTED_PROCESSED_TEXT_1 = ( - " An image of a snowman " - "warming himself by a fire." - ) - self.assertListEqual(processed_text, [EXPECTED_PROCESSED_TEXT_0, EXPECTED_PROCESSED_TEXT_1]) - - EXPECTED_FINAL_TEXT_0 = ( - "Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is " - "wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be " - "enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere." - ) - EXPECTED_FINAL_TEXT_1 = "An image of a snowman warming himself by a fire." 
- self.assertListEqual(all_final_text, [EXPECTED_FINAL_TEXT_0, EXPECTED_FINAL_TEXT_1]) - - EXPECTED_ENTITIES_0 = [ - ("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]), - ("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]), - ("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]), - ("gloves", (127, 133), [(0.515625, 0.390625, 0.640625, 0.515625)]), - ("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]), - ("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]), - ] - EXPECTED_ENTITIES_1 = [ - ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), - ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]), - ] - self.assertListEqual(all_entities, [EXPECTED_ENTITIES_0, EXPECTED_ENTITIES_1]) - - # right padding - processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") - - scores, generated_ids, generated_text, processed_text, final_text_with_entities = self.run_example( - prompt, [image] * len(prompt), model, processor - ) - all_final_text = [x[0] for x in final_text_with_entities] - all_entities = [x[1] for x in final_text_with_entities] - - # For right padding, only the non-padded sequences will give the same results as non-padding - self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0) - self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0) - self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0) diff --git a/tests/transformers/models/kosmos2/test_processor_kosmos2.py b/tests/transformers/models/kosmos2/test_processor_kosmos2.py deleted file mode 100644 index 6d17c17a5..000000000 --- a/tests/transformers/models/kosmos2/test_processor_kosmos2.py +++ /dev/null @@ -1,481 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import shutil -import tempfile -import unittest -from tempfile import TemporaryDirectory - -import numpy as np -import pytest -import requests - -from mindnlp.utils.testing_utils import ( - get_tests_dir, - require_sentencepiece, - require_tokenizers, - require_mindspore, - require_vision, -) -from mindnlp.utils import is_vision_available - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ( - AutoProcessor, - CLIPImageProcessor, - Kosmos2Processor, - PreTrainedTokenizerFast, - XLMRobertaTokenizer, - XLMRobertaTokenizerFast, - ) - - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - - -@require_sentencepiece -@require_tokenizers -@require_vision -class Kosmos2ProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - image_processor = CLIPImageProcessor() - - # We have a SentencePiece fixture for testing - slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB) - fast_tokenizer = XLMRobertaTokenizerFast(__slow_tokenizer=slow_tokenizer) - - processor = Kosmos2Processor(image_processor, fast_tokenizer) - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - - def test_image_procesor_load_save_reload(self): - # make sure load from Hub repo. 
-> save -> reload locally work - image_processor = CLIPImageProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") - with TemporaryDirectory() as tmp_dir: - image_processor.save_pretrained(tmp_dir) - reloaded_image_processor = CLIPImageProcessor.from_pretrained(tmp_dir) - assert image_processor.to_dict() == reloaded_image_processor.to_dict() - assert image_processor.to_json_string() == reloaded_image_processor.to_json_string() - - def test_save_load_pretrained_additional_features(self): - processor = Kosmos2Processor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = Kosmos2Processor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, CLIPImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_image_processor = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_processor.keys(): - self.assertAlmostEqual(input_image_processor[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "This is a test" - - encoded_processor = processor(text=input_str, add_eos_token=True) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "This is a test" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual( - list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask", "image_embeds_position_mask"] - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Kosmos2Processor(tokenizer=tokenizer, image_processor=image_processor) - - 
input_str = "This is a test" - image_input = self.prepare_image_inputs() - - # both image and text - inputs = processor(text=input_str, images=image_input) - self.assertListEqual( - list(inputs.keys()), ["pixel_values", "input_ids", "attention_mask", "image_embeds_position_mask"] - ) - - # only text - inputs = processor(text=input_str) - self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask"]) - - # only image - inputs = processor(images=image_input) - self.assertListEqual(list(inputs.keys()), ["pixel_values"]) - - @require_mindspore - def test_full_processor(self): - url = "https://hf-mirror.com/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg?download=true" - - processor = Kosmos2Processor.from_pretrained("microsoft/kosmos-2-patch14-224") - - # test with different input formats. - # fmt: off - texts = [ - # no phrase - " Two puppies sit in a field of grass.", - # 1 phrase - " Two puppies sit in a field of grass.", - # 2 phrases - " Two puppies sit in a field of grass .", - # 2 phrases: bboxes already specified for the 1st phrase - " Two puppies sit in a field of grass .", - ] - # fmt: on - - image = Image.open(requests.get(url, stream=True).raw) - # To match the official (microsoft) Kosmos-2 demo from which the expected values here are grabbed - image_path = os.path.join(self.tmpdirname, "image.jpg") - image.save(image_path) - image = Image.open(image_path) - - # fmt: off - bboxes = [ - [None, []], - [[None], [[]], [(79, 1016)], [[(79, 1016)]], [[(79, 1016), (135, 1008)]]], - [[[(79, 1016), (135, 1008)], None], [[(79, 1016), (135, 1008)], []], [[(79, 1016), (135, 1008)], (480, 1023)], [[(79, 1016), (135, 1008)], [(480, 1023)]]], - [[None, [(480, 1023)]]], - ] - # fmt: on - - batch_image = [image] * 4 - batch_text = [texts[0], texts[1], texts[1], texts[2]] - batch_bboxes = [ - None, # no phrase - [[]], # 1 phrase: no bbox - [(79, 1016)], # 1 phrase: 1 bbox - [[(79, 1016), (135, 1008)], (480, 1023)], # 2 phrase: 2 bboxes + 1 bbox - ] - - # fmt: off - expected_input_ids = [ - [0, 64012, 1264, 17772, 1357, 12, 10, 770, 9, 4464, 4, 2], - [0, 64012, 64007, 1264, 17772, 64008, 1357, 12, 10, 770, 9, 4464, 4, 2], - [0, 64012, 64007, 1264, 17772, 64008, 64009, 64092, 65029, 64010, 1357, 12, 10, 770, 9, 4464, 4, 2], - [0, 64012, 64007, 1264, 17772, 64008, 64009, 64092, 65029, 64011, 64148, 65021, 64010, 1357, 12, 10, 770, 9, 4464, 4, 2], - [0, 64012, 64007, 1264, 17772, 64008, 64009, 64092, 65029, 64011, 64148, 65021, 64010, 1357, 12, 10, 770, 9, 64007, 4464, 64008, 106, 4, 2], - [0, 64012, 64007, 1264, 17772, 64008, 64009, 64092, 65029, 64011, 64148, 65021, 64010, 1357, 12, 10, 770, 9, 64007, 4464, 64008, 64009, 64493, 65036, 64010, 106, 4, 2], - ] - # fmt: on - - EXPECTED_PIXEL_VALUES_1 = np.array( - [ - [ - [-0.6535852551460266, -0.6389868259429932, -0.6243883967399597], - [-0.6535852551460266, -0.6389868259429932, -0.6243883967399597], - [-0.6243883967399597, -0.6243883967399597, -0.5951915383338928], - ], - [ - [-0.20629698038101196, -0.19128920137882233, -0.19128920137882233], - [-0.20629698038101196, -0.19128920137882233, -0.17628143727779388], - [-0.2213047444820404, -0.20629698038101196, -0.16127367317676544], - ], - [ - [-0.5843556523323059, -0.5701355338096619, -0.5701355338096619], - [-0.5843556523323059, -0.5701355338096619, -0.5559154152870178], - [-0.5843556523323059, -0.5559154152870178, -0.5416953563690186], - ], - ] - ) - EXPECTED_PIXEL_VALUES_2 = np.array( - [ - [ - [-0.4346088469028473, -0.47840413451194763, -0.7849710583686829], - [-0.5221993923187256, 
-0.5076009631156921, -0.755774199962616], - [-0.5221993923187256, -0.5076009631156921, -0.7411757707595825], - ], - [ - [-0.2813358008861542, -0.2963435649871826, -0.431413471698761], - [-0.26632803678512573, -0.2963435649871826, -0.4764367938041687], - [-0.2213047444820404, -0.2813358008861542, -0.49144455790519714], - ], - [ - [-0.5701355338096619, -0.641235888004303, -0.7549964189529419], - [-0.5843556523323059, -0.641235888004303, -0.7834365367889404], - [-0.5559154152870178, -0.641235888004303, -0.7834365367889404], - ], - ] - ) - - def check(texts, bboxes, expected_input_ids): - outputs = processor(images=None, text=texts, bboxes=bboxes, add_eos_token=True) - self.assertListEqual(outputs.input_ids, expected_input_ids) - - # no phrase - check(texts[0], bboxes[0][0], expected_input_ids[0]) - - # no phrase - check(texts[0], bboxes[0][1], expected_input_ids[0]) - - # 1 phrase: no bbox - check(texts[1], bboxes[1][0], expected_input_ids[1]) - - # 1 phrase: no bbox - check(texts[1], bboxes[1][1], expected_input_ids[1]) - - # 1 phrase: 1 bbox - check(texts[1], bboxes[1][2], expected_input_ids[2]) - - # 1 phrase: 1 bbox - check(texts[1], bboxes[1][3], expected_input_ids[2]) - - # 1 phrase: 2 bboxes - check(texts[1], bboxes[1][4], expected_input_ids[3]) - - # could not contain `[None]` - with pytest.raises(ValueError): - _ = processor.preprocess_examples(images=None, texts=texts[1], bboxes=[[None]]) - - # 2 phrase: 2 bboxes + no bbox - check(texts[2], bboxes[2][0], expected_input_ids[4]) - - # 2 phrase: 2 bboxes + no bbox - check(texts[2], bboxes[2][1], expected_input_ids[4]) - - # 2 phrase: 2 bboxes + 1 bbox - check(texts[2], bboxes[2][2], expected_input_ids[5]) - - # 2 phrase: 2 bboxes + 1 bbox - check(texts[2], bboxes[2][3], expected_input_ids[5]) - - # 2 phrase: no box (as already specified in the text) + 1 bbox - check(texts[3], bboxes[3][0], expected_input_ids[5]) - - # could not contain `[None]` - with pytest.raises(ValueError): - _ = processor.preprocess_examples(images=None, texts=texts[2], bboxes=[[(79, 1016), (135, 1008)], [None]]) - - # test batch - outputs = processor( - images=None, - text=batch_text, - bboxes=batch_bboxes, - add_eos_token=True, - ) - self.assertListEqual( - outputs.input_ids, - [expected_input_ids[0], expected_input_ids[1], expected_input_ids[2], expected_input_ids[5]], - ) - - # test batch with padding (without `return_tensors`) - outputs = processor( - images=None, - text=batch_text, - bboxes=batch_bboxes, - padding=True, - add_eos_token=True, - ) - # padding on the right - self.assertListEqual( - outputs.input_ids[0], - expected_input_ids[0] + [1] * (len(expected_input_ids[5]) - len(expected_input_ids[0])), - ) - self.assertListEqual( - outputs.attention_mask[0], - [1] * len(expected_input_ids[0]) + [0] * (len(expected_input_ids[5]) - len(expected_input_ids[0])), - ) - # no padding for the longest sequence - self.assertListEqual(outputs.input_ids[-1], expected_input_ids[5]) - self.assertListEqual(outputs.attention_mask[-1], [1] * len(expected_input_ids[5])) - - # test batch with padding (with `return_tensors`) - outputs = processor( - images=None, - text=batch_text, - bboxes=batch_bboxes, - return_tensors="ms", - padding=True, - add_eos_token=True, - ) - # padding on the right - self.assertListEqual( - outputs.input_ids.numpy().tolist()[0], - expected_input_ids[0] + [1] * (len(expected_input_ids[5]) - len(expected_input_ids[0])), - ) - self.assertListEqual( - outputs.attention_mask.numpy().tolist()[0], - [1] * len(expected_input_ids[0]) + [0] * 
(len(expected_input_ids[5]) - len(expected_input_ids[0])), - ) - # no padding for the longest sequence - self.assertListEqual(outputs.input_ids.numpy().tolist()[-1], expected_input_ids[5]) - self.assertListEqual(outputs.attention_mask.numpy().tolist()[-1], [1] * len(expected_input_ids[5])) - - # test with image - num_image_tokens = 64 - - outputs = processor(images=image, text=texts[0], bboxes=None, add_eos_token=True) - self.assertTupleEqual(outputs.pixel_values[0].shape, (3, 224, 224)) - self.assertListEqual( - outputs.input_ids, - [0, 64003] + list(range(4, 4 + num_image_tokens)) + [64004] + expected_input_ids[0][1:], - ) - self.assertListEqual( - outputs.image_embeds_position_mask, - [0] * 2 + [1] * num_image_tokens + [0] + [0] * (len(expected_input_ids[0]) - 1), - ) - np.testing.assert_allclose(outputs.pixel_values[0][:3, :3, :3], EXPECTED_PIXEL_VALUES_1, atol=1e-9) - np.testing.assert_allclose(outputs.pixel_values[0][:3, -3:, -3:], EXPECTED_PIXEL_VALUES_2, atol=1e-9) - - # test with image in batch (right padding) - outputs = processor( - images=batch_image, - text=batch_text, - bboxes=batch_bboxes, - return_tensors="ms", - padding=True, - add_eos_token=True, - ) - self.assertTupleEqual(outputs.pixel_values.shape, (4, 3, 224, 224)) - np.testing.assert_allclose( - outputs.pixel_values[:, :3, :3, :3].numpy(), [EXPECTED_PIXEL_VALUES_1] * len(batch_image), atol=1e-9 - ) - np.testing.assert_allclose( - outputs.pixel_values[:, :3, -3:, -3:].numpy(), [EXPECTED_PIXEL_VALUES_2] * len(batch_image), atol=1e-9 - ) - # padding on the right: the `[1:]` below is because the part for `BOS` is already added in the beginning of each (dynamically computed) expected value # noqa - # fmt: off - EXPECTED_IDS_BATCH_RIGHT_PADDING = [ - [0, 64003] + list(range(4, 4 + num_image_tokens)) + [64004] + expected_input_ids[0][1:] + [1] * (len(expected_input_ids[5]) - len(expected_input_ids[0])), - [0, 64003] + list(range(4, 4 + num_image_tokens)) + [64004] + expected_input_ids[5][1:], - ] - EXPECTED_MASK_BATCH_RIGHT_PADDING = [ - [1, 1] + [1] * num_image_tokens + [1] + [1] * len(expected_input_ids[0][1:]) + [0] * (len(expected_input_ids[5]) - len(expected_input_ids[0])), - [1] * (2 + num_image_tokens + len(expected_input_ids[5])), - ] - # fmt: on - self.assertListEqual(outputs.input_ids.numpy().tolist()[0], EXPECTED_IDS_BATCH_RIGHT_PADDING[0]) - self.assertListEqual(outputs.attention_mask.numpy().tolist()[0], EXPECTED_MASK_BATCH_RIGHT_PADDING[0]) - self.assertListEqual(outputs.input_ids.numpy().tolist()[-1], EXPECTED_IDS_BATCH_RIGHT_PADDING[-1]) - self.assertListEqual(outputs.attention_mask.numpy().tolist()[-1], EXPECTED_MASK_BATCH_RIGHT_PADDING[-1]) - self.assertListEqual( - outputs.image_embeds_position_mask.numpy().tolist(), - [[0, 0] + [1] * num_image_tokens + [0] + [0] * (len(expected_input_ids[5]) - 1)] * len(batch_image), - ) - - processor = Kosmos2Processor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left") - - # test with image in batch (left padding) - outputs = processor( - images=batch_image, - text=batch_text, - bboxes=batch_bboxes, - return_tensors="ms", - padding=True, - add_eos_token=True, - ) - # padding on the left: the `[1:]` below is because the part for `BOS` is already added in the beginning of each (dynamically computed) expected value # noqa - # fmt: off - EXPECTED_IDS_BATCH = [ - [1] * (len(expected_input_ids[5]) - len(expected_input_ids[0])) + [0, 64003] + list(range(4, 4 + num_image_tokens)) + [64004] + expected_input_ids[0][1:], - [0, 64003] + list(range(4, 4 + 
num_image_tokens)) + [64004] + expected_input_ids[5][1:], - ] - EXPECTED_MASK_BATCH =[ - [0] * (len(expected_input_ids[5]) - len(expected_input_ids[0])) + [1, 1] + [1] * num_image_tokens + [1] + [1] * len(expected_input_ids[0][1:]), - [1] * (2 + num_image_tokens + len(expected_input_ids[5])), - ] - EXPECTED_IMG_POS_MASK_BATCH = [ - [0] * (len(expected_input_ids[5]) - len(expected_input_ids[0])) + [0, 0] + [1] * num_image_tokens + [0] + [0] * len(expected_input_ids[0][1:]), - [0, 0] + [1] * num_image_tokens + [0] + [0] * (len(expected_input_ids[5]) - 1), - ] - # fmt: on - - self.assertListEqual(outputs.input_ids.numpy().tolist()[0], EXPECTED_IDS_BATCH[0]) - self.assertListEqual(outputs.attention_mask.numpy().tolist()[0], EXPECTED_MASK_BATCH[0]) - self.assertListEqual(outputs.image_embeds_position_mask.numpy().tolist()[0], EXPECTED_IMG_POS_MASK_BATCH[0]) - - # no padding for the longest sequence - self.assertListEqual(outputs.input_ids.numpy().tolist()[-1], EXPECTED_IDS_BATCH[-1]) - self.assertListEqual(outputs.attention_mask.numpy().tolist()[-1], EXPECTED_MASK_BATCH[-1]) - self.assertListEqual(outputs.image_embeds_position_mask.numpy().tolist()[-1], EXPECTED_IMG_POS_MASK_BATCH[-1]) diff --git a/tests/transformers/models/layoutlm/__init__.py b/tests/transformers/models/layoutlm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/layoutlm/test_modeling_layoutlm.py b/tests/transformers/models/layoutlm/test_modeling_layoutlm.py deleted file mode 100644 index 633f43490..000000000 --- a/tests/transformers/models/layoutlm/test_modeling_layoutlm.py +++ /dev/null @@ -1,394 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors, The Hugging Face Team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -from mindnlp.transformers import LayoutLMConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - LayoutLMForMaskedLM, - LayoutLMForQuestionAnswering, - LayoutLMForSequenceClassification, - LayoutLMForTokenClassification, - LayoutLMModel, - ) - - -class LayoutLMModelTester: - """You can also import this e.g from .test_modeling_layoutlm import LayoutLMModelTester""" - - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - range_bbox=1000, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.range_bbox = range_bbox - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox) - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = bbox[i, j, 2] - bbox[i, j, 2] = bbox[i, j, 0] - bbox[i, j, 0] = t - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return LayoutLMConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - 
num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def create_and_check_model( - self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LayoutLMModel(config=config) - model.eval() - result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, bbox, token_type_ids=token_type_ids) - result = model(input_ids, bbox) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LayoutLMForMaskedLM(config=config) - model.eval() - result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = LayoutLMForSequenceClassification(config) - model.eval() - result = model( - input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = LayoutLMForTokenClassification(config=config) - model.eval() - result = model(input_ids, bbox, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_question_answering( - self, config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LayoutLMForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - bbox=bbox, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "bbox": bbox, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - LayoutLMModel, - LayoutLMForMaskedLM, - 
LayoutLMForSequenceClassification, - LayoutLMForTokenClassification, - LayoutLMForQuestionAnswering, - ) - if is_mindspore_available() - else None - ) - pipeline_model_mapping = ( - { - "document-question-answering": LayoutLMForQuestionAnswering, - "feature-extraction": LayoutLMModel, - "fill-mask": LayoutLMForMaskedLM, - "text-classification": LayoutLMForSequenceClassification, - "token-classification": LayoutLMForTokenClassification, - "zero-shot": LayoutLMForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - - def setUp(self): - self.model_tester = LayoutLMModelTester(self) - self.config_tester = ConfigTester(self, config_class=LayoutLMConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -def prepare_layoutlm_batch_inputs(): - # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: - # fmt: off - input_ids = mindspore.tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]]) # noqa: E231 - attention_mask = mindspore.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],]) # noqa: E231 - bbox =
mindspore.tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]]) # noqa: E231 - token_type_ids = mindspore.tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]) # noqa: E231 - # these are sequence labels (i.e. at the token level) - labels = mindspore.tensor([[-100,10,10,10,9,1,-100,7,7,-100,7,7,4,2,5,2,8,8,-100,-100,5,0,3,2,-100],[-100,12,12,12,-100,12,10,-100,-100,-100,-100,10,12,9,-100,-100,-100,10,10,10,9,12,-100,10,-100]]) # noqa: E231 - # fmt: on - - return input_ids, attention_mask, bbox, token_type_ids, labels - - -@require_mindspore -class LayoutLMModelIntegrationTest(unittest.TestCase): - @slow - def test_forward_pass_no_head(self): - model = LayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased") - - input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() - - # forward pass - outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids) - - # test the sequence output on [0, :3, :3] - expected_slice = mindspore.tensor( - [[0.1785, -0.1947, -0.0425], [-0.3254, -0.2807, 0.2553], [-0.5391, -0.3322, 0.3364]], - ) - - self.assertTrue(ops.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-3)) - - # test the pooled output on [1, :3] - expected_slice = mindspore.tensor([-0.6580, -0.0214, 0.8552]) - - self.assertTrue(ops.allclose(outputs.pooler_output[1, :3], expected_slice, atol=1e-3)) - - @slow - def test_forward_pass_sequence_classification(self): - # initialize model with randomly initialized sequence classification head - model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=2) - input_ids, attention_mask, bbox, token_type_ids, _ = prepare_layoutlm_batch_inputs() - - # forward pass - outputs = model( - input_ids=input_ids, - bbox=bbox, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=mindspore.tensor([1, 1]), - ) - - # test whether we get a loss as a scalar - loss = outputs.loss - expected_shape = () - self.assertEqual(loss.shape, expected_shape) - - # test the shape of the logits - logits = outputs.logits - expected_shape = (2, 2) - self.assertEqual(logits.shape, expected_shape) - - @slow - def test_forward_pass_token_classification(self): - # initialize model with randomly initialized token classification head - model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=13) - input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() - - # forward pass - outputs = model( - input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, 
token_type_ids=token_type_ids, labels=labels - ) - - # test the loss calculation to be around 2.65 - # expected_loss = mindspore.tensor(2.65) - - # The loss is currently somewhat random and can vary between 0.1-0.3 atol. - # self.assertTrue(ops.allclose(outputs.loss, expected_loss, atol=0.1)) - - # test the shape of the logits - logits = outputs.logits - expected_shape = (2, 25, 13) - self.assertEqual(logits.shape, expected_shape) - - @slow - def test_forward_pass_question_answering(self): - # initialize model with randomly initialized question answering head - model = LayoutLMForQuestionAnswering.from_pretrained("microsoft/layoutlm-base-uncased") - - input_ids, attention_mask, bbox, token_type_ids, labels = prepare_layoutlm_batch_inputs() - - # forward pass - outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids) - - # test the shape of the logits - expected_shape = (2, 25) - self.assertEqual(outputs.start_logits.shape, expected_shape) - self.assertEqual(outputs.end_logits.shape, expected_shape) \ No newline at end of file diff --git a/tests/transformers/models/layoutlmv2/__init__.py b/tests/transformers/models/layoutlmv2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/transformers/models/layoutlmv2/test_modeling_layoutlmv2.py deleted file mode 100644 index 67db40444..000000000 --- a/tests/transformers/models/layoutlmv2/test_modeling_layoutlmv2.py +++ /dev/null @@ -1,572 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore LayoutLMv2 model.""" - -import unittest -from typing import List, Tuple -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - import mindnlp.core.nn.functional as F - - from mindnlp.transformers import ( - LayoutLMv2Config, - LayoutLMv2ForQuestionAnswering, - LayoutLMv2ForSequenceClassification, - LayoutLMv2ForTokenClassification, - LayoutLMv2Model, - ) - - -class ImageList: - """ - Structure that holds a list of images (of possibly - varying sizes) as a single tensor. - This works by padding the images to the same size. - The original sizes of each image are stored in `image_sizes`. - - Attributes: - image_sizes (list[tuple[int, int]]): each tuple is (h, w). - During tracing, it becomes list[Tensor] instead.
- """ - - def __init__(self, tensor: mindspore.Tensor, image_sizes: List[Tuple[int, int]]): - """ - Arguments: - tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 - image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can - be smaller than (H, W) due to padding. - """ - self.tensor = tensor - self.image_sizes = image_sizes - - def __len__(self) -> int: - return len(self.image_sizes) - - def __getitem__(self, idx) -> mindspore.Tensor: - """ - Access the individual image in its original size. - - Args: - idx: int or slice - - Returns: - Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 - """ - size = self.image_sizes[idx] - return self.tensor[idx, ..., : size[0], : size[1]] - -class LayoutLMv2ModelTester: - def __init__( - self, - parent, - batch_size=2, - num_channels=3, - image_size=4, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=36, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - image_feature_pool_shape=[7, 7, 256], - coordinate_size=6, - shape_size=6, - num_labels=3, - num_choices=4, - scope=None, - range_bbox=1000, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.image_feature_pool_shape = image_feature_pool_shape - self.coordinate_size = coordinate_size - self.shape_size = shape_size - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.range_bbox = range_bbox - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox) - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = bbox[i, j, 2] - bbox[i, j, 2] = bbox[i, j, 0] - bbox[i, j, 0] = t - - image = ImageList( - ops.zeros(self.batch_size, self.num_channels, self.image_size, self.image_size), - self.image_size, - ) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], 
self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = LayoutLMv2Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - image_feature_pool_shape=self.image_feature_pool_shape, - coordinate_size=self.coordinate_size, - shape_size=self.shape_size, - ) - - # use smaller resnet backbone to make tests faster - config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18 - config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64 - config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1 - - return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels - - def create_and_check_model( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels - ): - model = LayoutLMv2Model(config=config) - model.eval() - - result = model(input_ids, bbox=bbox, image=image, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, bbox=bbox, image=image, token_type_ids=token_type_ids) - result = model(input_ids, bbox=bbox, image=image) - - # LayoutLMv2 has a different expected sequence length, namely also visual tokens are added - expected_seq_len = self.seq_length + self.image_feature_pool_shape[0] * self.image_feature_pool_shape[1] - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels - ): - config.num_labels = self.num_labels - model = LayoutLMv2ForSequenceClassification(config) - model.eval() - result = model( - input_ids, - bbox=bbox, - image=image, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels - ): - config.num_labels = self.num_labels - model = LayoutLMv2ForTokenClassification(config=config) - model.eval() - result = model( - input_ids, - bbox=bbox, - image=image, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_question_answering( - self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels - ): - model = LayoutLMv2ForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - bbox=bbox, - image=image, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, 
(self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - image, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "bbox": bbox, - "image": image, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False - test_torchscript = True - test_mismatched_shapes = False - - all_model_classes = ( - ( - LayoutLMv2Model, - LayoutLMv2ForSequenceClassification, - LayoutLMv2ForTokenClassification, - LayoutLMv2ForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"document-question-answering": LayoutLMv2ForQuestionAnswering, "feature-extraction": LayoutLMv2Model} - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = LayoutLMv2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=LayoutLMv2Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - # LayoutLMv2 has a different expected sequence length - expected_seq_len = ( - self.model_tester.seq_length - + self.model_tester.image_feature_pool_shape[0] * self.model_tester.image_feature_pool_shape[1] - ) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, expected_seq_len, expected_seq_len], 
- ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, expected_seq_len, expected_seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # LayoutLMv2 has a different expected sequence length - expected_seq_len = ( - self.model_tester.seq_length - + self.model_tester.image_feature_pool_shape[0] * self.model_tester.image_feature_pool_shape[1] - ) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [expected_seq_len, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="We cannot configure detectron2 to output a smaller backbone") - def test_model_is_small(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/layoutlmv2-base-uncased" - model = LayoutLMv2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "backbone" in name or "visual_segment_embedding" in name: - continue - - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_batching_equivalence(self): - def equivalence(tensor1, tensor2): - return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0) - - def recursive_check(batched_object, single_row_object, model_name, key): - if isinstance(batched_object, (list, tuple)): - for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - elif batched_object is None: - return - else: - batched_row = batched_object[:1] - self.assertFalse( - 
ops.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" - ) - self.assertTrue( - (equivalence(batched_row, single_row_object)) <= 1e-03, - msg=( - f"Batched and Single row outputs are not equal in {model_name} for key={key}. " - f"Difference={equivalence(batched_row, single_row_object)}." - ), - ) - - config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - config.output_hidden_states = True - - model_name = model_class.__name__ - batched_input_prepared = self._prepare_for_class(batched_input, model_class) - model = model_class(config).eval() - batch_size = self.model_tester.batch_size - - single_row_input = {} - for key, value in batched_input_prepared.items(): - if isinstance(value, mindspore.Tensor) and value.shape[0] % batch_size == 0: - single_batch_shape = value.shape[0] // batch_size - single_row_input[key] = value[:single_batch_shape] - elif hasattr(value, "tensor"): - # layoutlmv2uses ImageList intead of pixel values (needs for torchscript) - single_row_input[key] = value.tensor[:single_batch_shape] - - with no_grad(): - model_batched_output = model(**batched_input_prepared) - model_row_output = model(**single_row_input) - - for key in model_batched_output: - recursive_check(model_batched_output[key], model_row_output[key], model_name, key) - - -def prepare_layoutlmv2_batch_inputs(): - # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on: - # fmt: off - input_ids = mindspore.tensor([[101,1019,1014,1016,1037,12849,4747,1004,14246,2278,5439,4524,5002,2930,2193,2930,4341,3208,1005,1055,2171,2848,11300,3531,102],[101,4070,4034,7020,1024,3058,1015,1013,2861,1013,6070,19274,2772,6205,27814,16147,16147,4343,2047,10283,10969,14389,1012,2338,102]]) # noqa: E231 - bbox = mindspore.tensor([[[0,0,0,0],[423,237,440,251],[427,272,441,287],[419,115,437,129],[961,885,992,912],[256,38,330,58],[256,38,330,58],[336,42,353,57],[360,39,401,56],[360,39,401,56],[411,39,471,59],[479,41,528,59],[533,39,630,60],[67,113,134,131],[141,115,209,132],[68,149,133,166],[141,149,187,164],[195,148,287,165],[195,148,287,165],[195,148,287,165],[295,148,349,165],[441,149,492,166],[497,149,546,164],[64,201,125,218],[1000,1000,1000,1000]],[[0,0,0,0],[662,150,754,166],[665,199,742,211],[519,213,554,228],[519,213,554,228],[134,433,187,454],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[130,467,204,480],[314,469,376,482],[504,684,582,706],[941,825,973,900],[941,825,973,900],[941,825,973,900],[941,825,973,900],[610,749,652,765],[130,659,168,672],[176,657,237,672],[238,657,312,672],[443,653,628,672],[443,653,628,672],[716,301,825,317],[1000,1000,1000,1000]]]) # noqa: E231 - image = ImageList(ops.randn((2,3,224,224)), image_sizes=[(224,224), (224,224)]) # noqa: E231 - attention_mask = mindspore.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],]) # noqa: E231 - token_type_ids = mindspore.tensor([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]) # noqa: E231 
- # fmt: on - - return input_ids, bbox, image, attention_mask, token_type_ids - - -@require_mindspore -class LayoutLMv2ModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased") - - ( - input_ids, - bbox, - image, - attention_mask, - token_type_ids, - ) = prepare_layoutlmv2_batch_inputs() - - # forward pass - outputs = model( - input_ids=input_ids, - bbox=bbox, - image=image, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - ) - - # verify the sequence output - expected_shape = ( - 2, - input_ids.shape[1] - + model.config.image_feature_pool_shape[0] * model.config.image_feature_pool_shape[1], - model.config.hidden_size, - ) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.1087, 0.0727, -0.3075], [0.0799, -0.0427, -0.0751], [-0.0367, 0.0480, -0.1358]] - ) - self.assertTrue(ops.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-3)) - - # verify the pooled output - expected_shape = (2, model.config.hidden_size) - self.assertEqual(outputs.pooler_output.shape, expected_shape) \ No newline at end of file diff --git a/tests/transformers/models/layoutlmv3/__init__.py b/tests/transformers/models/layoutlmv3/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/transformers/models/layoutlmv3/test_modeling_layoutlmv3.py deleted file mode 100644 index 6128b3cf8..000000000 --- a/tests/transformers/models/layoutlmv3/test_modeling_layoutlmv3.py +++ /dev/null @@ -1,408 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch LayoutLMv3 model.""" - -import copy -import unittest - -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn - - from mindnlp.transformers import ( - MODEL_FOR_MULTIPLE_CHOICE_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - LayoutLMv3Config, - LayoutLMv3ForQuestionAnswering, - LayoutLMv3ForSequenceClassification, - LayoutLMv3ForTokenClassification, - LayoutLMv3Model, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import LayoutLMv3ImageProcessor - - -class LayoutLMv3ModelTester: - def __init__( - self, - parent, - batch_size=2, - num_channels=3, - image_size=4, - patch_size=2, - text_seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=36, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - coordinate_size=6, - shape_size=6, - num_labels=3, - num_choices=4, - scope=None, - range_bbox=1000, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.patch_size = patch_size - self.text_seq_length = text_seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.coordinate_size = coordinate_size - self.shape_size = shape_size - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.range_bbox = range_bbox - - # LayoutLMv3's sequence length equals the number of text tokens + number of patches + 1 (we add 1 for the CLS token) - self.text_seq_length = text_seq_length - self.image_seq_length = (image_size // patch_size) ** 2 + 1 - self.seq_length = self.text_seq_length + self.image_seq_length - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.text_seq_length], self.vocab_size) - - bbox = ids_tensor([self.batch_size, self.text_seq_length, 4], self.range_bbox) - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = 
bbox[i, j, 2] - bbox[i, j, 2] = bbox[i, j, 0] - bbox[i, j, 0] = t - - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.text_seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.text_seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.text_seq_length], self.num_labels) - - config = LayoutLMv3Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - coordinate_size=self.coordinate_size, - shape_size=self.shape_size, - input_size=self.image_size, - patch_size=self.patch_size, - ) - - return config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - - def create_and_check_model( - self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - ): - model = LayoutLMv3Model(config=config) - model.eval() - - # text + image - result = model(input_ids, pixel_values=pixel_values) - result = model( - input_ids, bbox=bbox, pixel_values=pixel_values, attention_mask=input_mask, token_type_ids=token_type_ids - ) - result = model(input_ids, bbox=bbox, pixel_values=pixel_values, token_type_ids=token_type_ids) - result = model(input_ids, bbox=bbox, pixel_values=pixel_values) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # text only - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.text_seq_length, self.hidden_size) - ) - - # image only - result = model(pixel_values=pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.image_seq_length, self.hidden_size) - ) - - def create_and_check_for_sequence_classification( - self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - ): - config.num_labels = self.num_labels - model = LayoutLMv3ForSequenceClassification(config) - model.eval() - result = model( - input_ids, - bbox=bbox, - pixel_values=pixel_values, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - ): - config.num_labels = self.num_labels - model = LayoutLMv3ForTokenClassification(config=config) - model.eval() - result = model( - input_ids, - bbox=bbox, - pixel_values=pixel_values, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.text_seq_length, self.num_labels)) - - def 
create_and_check_for_question_answering( - self, config, input_ids, bbox, pixel_values, token_type_ids, input_mask, sequence_labels, token_labels - ): - model = LayoutLMv3ForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - bbox=bbox, - pixel_values=pixel_values, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - pixel_values, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "bbox": bbox, - "pixel_values": pixel_values, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class LayoutLMv3ModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False - test_torchscript = False - test_mismatched_shapes = False - - all_model_classes = ( - ( - LayoutLMv3Model, - LayoutLMv3ForSequenceClassification, - LayoutLMv3ForTokenClassification, - LayoutLMv3ForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"document-question-answering": LayoutLMv3ForQuestionAnswering, "feature-extraction": LayoutLMv3Model} - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - # `DocumentQuestionAnsweringPipeline` is expected to work with this model, but it combines the text and visual - # embedding along the sequence dimension (dim 1), which causes an error during post-processing as `p_mask` has - # the sequence dimension of the text embedding only. 
- # (see the line `embedding_output = ops.cat([embedding_output, visual_embeddings], dim=1)`) - return True - - def setUp(self): - self.model_tester = LayoutLMv3ModelTester(self) - self.config_tester = ConfigTester(self, config_class=LayoutLMv3Config, hidden_size=37) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict = { - k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1) - if isinstance(v, mindspore.Tensor) and v.ndim > 1 - else v - for k, v in inputs_dict.items() - } - if return_labels: - if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING): - inputs_dict["labels"] = ops.ones(self.model_tester.batch_size, dtype=mindspore.int64) - elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): - inputs_dict["start_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - inputs_dict["end_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class in [ - *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), - ]: - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class in [ - *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), - ]: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.text_seq_length), - dtype=mindspore.int64 - ) - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/layoutlmv3-base" - model = LayoutLMv3Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -class LayoutLMv3ModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return LayoutLMv3ImageProcessor(apply_ocr=False) if is_vision_available() else None - - @slow - def test_inference_no_head(self): - model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base") - - image_processor = self.default_image_processor - image = prepare_img() - pixel_values = image_processor(images=image, return_tensors="ms").pixel_values - - input_ids = mindspore.tensor([[1, 2]]) - bbox = mindspore.tensor([[1, 2, 3, 4], 
[5, 6, 7, 8]]).unsqueeze(0) - - # forward pass - outputs = model( - input_ids=input_ids, - bbox=bbox, - pixel_values=pixel_values, - ) - - # verify the logits - expected_shape = (1, 199, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.0529, 0.3618, 0.1632], [-0.1587, -0.1667, -0.0400], [-0.1557, -0.1671, -0.0505]] - ) - print(outputs.last_hidden_state[0, :3, :3]) - self.assertTrue(ops.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/transformers/models/led/__init__.py b/tests/transformers/models/led/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/led/test_modeling_led.py b/tests/transformers/models/led/test_modeling_led.py deleted file mode 100644 index 1252dccf7..000000000 --- a/tests/transformers/models/led/test_modeling_led.py +++ /dev/null @@ -1,599 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore LED model.""" - -import copy -import tempfile -import unittest - -from mindnlp.transformers import LEDConfig, is_mindspore_available -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - LEDForConditionalGeneration, - LEDForQuestionAnswering, - LEDForSequenceClassification, - LEDModel, - LEDTokenizer, - ) - from mindnlp.transformers.models.led.modeling_led import LEDDecoder, LEDEncoder - -def prepare_led_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": 
head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class LEDModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=11, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=32, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - attention_window=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.attention_window = attention_window - - # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size - # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention - # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1] - # because its local attention only attends to `self.attention_window + 1` locations - # (assuming no token with global attention, otherwise the last dimension of attentions - # is x + self.attention_window + 1, where x is the number of tokens with global attention) - # x is set to 1 - self.encoder_key_length = self.attention_window + 2 - - # because of padding `encoder_seq_length`, is different from `seq_length`. 
Relevant for - # the `test_attention_outputs` and `test_hidden_states_output` tests - self.encoder_seq_length = self.seq_length - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_led_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return LEDConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - attention_window=self.attention_window, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.max_position_embeddings = 100 - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - global_attention_mask = ops.zeros_like(inputs_dict["input_ids"]) - global_attention_mask[:, -1] = 1 - inputs_dict["global_attention_mask"] = global_attention_mask - - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = LEDModel(config=config).get_decoder().eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(attention_mask.dtype)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = LEDModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - 
with tempfile.TemporaryDirectory() as tmpdirname:
-            encoder = model.get_encoder()
-            encoder.save_pretrained(tmpdirname)
-            encoder = LEDEncoder.from_pretrained(tmpdirname)
-
-            encoder_last_hidden_state_2 = encoder(
-                inputs_dict["input_ids"],
-                attention_mask=inputs_dict["attention_mask"],
-                global_attention_mask=inputs_dict["global_attention_mask"],
-            )[0]
-
-            self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            decoder = model.get_decoder()
-            decoder.save_pretrained(tmpdirname)
-            decoder = LEDDecoder.from_pretrained(tmpdirname)
-
-            last_hidden_state_2 = decoder(
-                input_ids=inputs_dict["decoder_input_ids"],
-                attention_mask=inputs_dict["decoder_attention_mask"],
-                encoder_hidden_states=encoder_last_hidden_state,
-                encoder_attention_mask=inputs_dict["attention_mask"],
-            )[0]
-
-            self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
-
-    def check_global_attention(self, config, inputs_dict):
-        model = LEDModel(config=config).eval()
-        model.config.output_attentions = True
-        attention_mask = ids_tensor(inputs_dict["input_ids"].shape, vocab_size=2)
-        global_attention_mask = ops.zeros_like(attention_mask)
-
-        # set some tokens to global_attention
-        num_tokens_with_global_attention = 2
-
-        attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1
-        global_attention_mask[:, 2 : 2 + num_tokens_with_global_attention] = 1
-        inputs_dict["attention_mask"] = attention_mask
-        inputs_dict["global_attention_mask"] = global_attention_mask
-
-        outputs = model(**inputs_dict)
-        self.parent.assertIsNotNone(outputs.encoder_global_attentions)
-
-        # setting `num_tokens_with_global_attention` to global_attentions yields
-        # makes last dim to be of `num_tokens_with_global_attention`
-        self.parent.assertTrue(
-            outputs.encoder_global_attentions[0].shape,
-            (self.batch_size, self.num_attention_heads, self.encoder_seq_length, num_tokens_with_global_attention),
-        )
-
-
-@require_mindspore
-class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (LEDModel, LEDForConditionalGeneration, LEDForSequenceClassification, LEDForQuestionAnswering)
-        if is_mindspore_available()
-        else ()
-    )
-    all_generative_model_classes = (LEDForConditionalGeneration,) if is_mindspore_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": LEDModel,
-            "question-answering": LEDForQuestionAnswering,
-            "summarization": LEDForConditionalGeneration,
-            "text-classification": LEDForSequenceClassification,
-            "text2text-generation": LEDForConditionalGeneration,
-            "translation": LEDForConditionalGeneration,
-            "zero-shot": LEDForSequenceClassification,
-        }
-        if is_mindspore_available()
-        else {}
-    )
-    is_encoder_decoder = True
-    test_pruning = False
-    test_missing_keys = False
-    test_torchscript = False
-
-    # TODO: Fix the failed tests when this model gets more usage
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"):
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = LEDModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=LEDConfig)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_save_load_strict(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
-            self.assertEqual(info["missing_keys"], [])
-
-    def test_decoder_model_past_with_large_inputs(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
-
-    def test_encoder_decoder_model_standalone(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
-
-    def test_global_attention(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
-        self.model_tester.check_global_attention(*config_and_inputs)
-
-    def _get_input_ids_and_config(self, batch_size=2):
-        config, input_ids, attention_mask, inputs_dict = GenerationTesterMixin._get_input_ids_and_config(
-            self, batch_size=batch_size
-        )
-        # LED computes attention scores based on mask indices if `is_global`
-        inputs_dict.pop("global_attention_mask")
-        return config, input_ids, attention_mask, inputs_dict
-
-    # LEDForSequenceClassification does not support inputs_embeds
-    def test_inputs_embeds(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in (LEDModel, LEDForConditionalGeneration, LEDForQuestionAnswering):
-            model = model_class(config)
-            model.eval()
-
-            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
-
-            if not self.is_encoder_decoder:
-                input_ids = inputs["input_ids"]
-                del inputs["input_ids"]
-            else:
-                encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
-                del inputs["input_ids"]
-                inputs.pop("decoder_input_ids", None)
-
-            wte = model.get_input_embeddings()
-            if not self.is_encoder_decoder:
-                inputs["inputs_embeds"] = wte(input_ids)
-            else:
-                inputs["inputs_embeds"] = wte(encoder_input_ids)
-                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
-
-            with no_grad():
-                model(**inputs)[0]
-
-    @require_mindspore
-    def test_generate_fp16(self):
-        config, input_dict = self.model_tester.prepare_config_and_inputs()
-        input_ids = input_dict["input_ids"]
-        attention_mask = input_ids.ne(1)
-        model = LEDForConditionalGeneration(config).eval()
-        model.half()
-        model.generate(input_ids, attention_mask=attention_mask)
-        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
-
-    @unittest.skip(reason="Longformer cannot keep gradients in attentions or hidden states")
-    def test_retain_grad_hidden_states_attentions(self):
-        return
-
-    def test_attention_outputs(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        config.return_dict = True
-
-        seq_length = self.model_tester.seq_length
-        encoder_seq_length = self.model_tester.encoder_seq_length
-        encoder_key_length = self.model_tester.encoder_key_length
-
-        for model_class in self.all_model_classes:
-            inputs_dict["output_attentions"] = True
-            inputs_dict["output_hidden_states"] = False
-            config.return_dict = True
-            model = model_class(config)
-            model.eval()
-            with no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            # check that output_attentions also work using config
-            del inputs_dict["output_attentions"]
-            config.output_attentions = True
-            model = model_class(config)
-            model.eval()
-            with no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
-            attentions = outputs.encoder_attentions
-            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
-
-            self.assertListEqual(
-                list(attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
-            )
-            out_len = len(outputs)
-
-            # global attention outputs are added as well => so +1 here
-            correct_outlen = 6
-
-            # loss is at first position
-            if "labels" in inputs_dict:
-                correct_outlen += 1  # loss is added to beginning
-            # Question Answering model returns start_logits and end_logits
-            if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
-                correct_outlen += 1  # start_logits and end_logits instead of only 1 output
-            if "past_key_values" in outputs:
-                correct_outlen += 1  # past_key_values have been returned
-
-            self.assertEqual(out_len, correct_outlen)
-
-            # decoder attentions
-            decoder_attentions = outputs.decoder_attentions
-            self.assertIsInstance(decoder_attentions, (list, tuple))
-            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(decoder_attentions[0].shape[-3:]),
-                [self.model_tester.num_attention_heads, seq_length, seq_length],
-            )
-
-            # cross attentions
-            cross_attentions = outputs.cross_attentions
-            self.assertIsInstance(cross_attentions, (list, tuple))
-            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
-            self.assertListEqual(
-                list(cross_attentions[0].shape[-3:]),
-                [
-                    self.model_tester.num_attention_heads,
-                    seq_length,
-                    seq_length,
-                ],
-            )
-
-    def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length):
-        # overwrite because LED does not have (bs, num_heads, seq_len, seq_len) shape
-        encoder_expected_shape = (
-            batch_size,
-            config.num_attention_heads,
-            seq_length,
-            self.model_tester.attention_window // 2 * 2 + 1,
-        )
-        self.assertIsInstance(attentions, tuple)
-        self.assertListEqual(
-            [layer_attentions.shape for layer_attentions in attentions],
-            [encoder_expected_shape] * len(attentions),
-        )
-
-
-def assert_tensors_close(a, b, atol=1e-12, prefix=""):
-    """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
-    if a is None and b is None:
-        return True
-    try:
-        if ops.allclose(a, b, atol=atol):
-            return True
-        raise
-    except Exception:
-        pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item()
-        if a.numel() > 100:
-            msg = f"tensor values are {pct_different:.1%} percent different."
-        else:
-            msg = f"{a} != {b}"
-        if prefix:
-            msg = prefix + ": " + msg
-        raise AssertionError(msg)
-
-
-def _long_tensor(tok_lst):
-    return mindspore.tensor(tok_lst, dtype=mindspore.int64)
-
-
-TOLERANCE = 1e-4
-
-
-@require_mindspore
-@require_sentencepiece
-@require_tokenizers
-@slow
-class LEDModelIntegrationTests(unittest.TestCase):
-    """All the below results were obtained with the original checkpoints and code
-    base from https://github.com/allenai/longformer.
-    IMPORTANT: Note that the original checkpoints include a `postion_embeddings` "hack"
-    and have to be cut to have the correct shape.
-    See: https://github.com/huggingface/transformers/pull/9278#issue-544709661.
- """ - - @cached_property - def default_tokenizer(self): - return LEDTokenizer.from_pretrained("allenai/led-base-16384") - - def test_inference_no_head(self): - model = LEDModel.from_pretrained("allenai/led-base-16384") - - # change to intended input - input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) - decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) - inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) - with no_grad(): - output = model(**inputs_dict).last_hidden_state - expected_shape = (1, 1024, 768) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = mindspore.tensor( - [[2.3050, 2.8279, 0.6531], [-1.8457, -0.1455, -3.5661], [-1.0186, 0.4586, -2.2043]] - ) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_head(self): - model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384") - - # change to intended input - input_ids = _long_tensor([512 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) - decoder_input_ids = _long_tensor([128 * [0, 31414, 232, 328, 740, 1140, 12695, 69]]) - inputs_dict = prepare_led_inputs_dict(model.config, input_ids, decoder_input_ids) - with no_grad(): - output = model(**inputs_dict, use_cache=False).logits - expected_shape = (1, 1024, model.config.vocab_size) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = mindspore.tensor( - [[33.6507, 6.4572, 16.8089], [5.8739, -2.4238, 11.2902], [-3.2139, -4.3149, 4.2783]] - ) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_seq_to_seq_generation(self): - # this test requires 16GB of RAM - hf = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv") - tok = LEDTokenizer.from_pretrained("allenai/led-large-16384-arxiv") - - ARTICLE_LEP = r"""the lep experiments at the resonance of @xmath1-boson have tested the standard model ( sm ) at quantum level , measuring the @xmath1-decay into fermion pairs with an accuracy of one part in ten thousands . the good agreement of the lep data with the sm predictions have severely constrained the behavior of new physics at the @xmath1-pole . taking these achievements into account one can imagine that the physics of @xmath1-boson will again play the central role in the frontier of particle physics if the next generation @xmath1 factory comes true with the generated @xmath1 events several orders of magnitude higher than that of the lep . this factory can be realized in the gigaz option of the international linear collider ( ilc)@xcite . the ilc is a proposed electron - positron collider with tunable energy ranging from @xmath12 to @xmath13 and polarized beams in its first phase , and the gigaz option corresponds to its operation on top of the resonance of @xmath1 boson by adding a bypass to its main beam line . given the high luminosity , @xmath14 , and the cross section at the resonance of @xmath1 boson , @xmath15 , about @xmath16 @xmath1 events can be generated in an operational year of @xmath17 of gigaz , which implies that the expected sensitivity to the branching ratio of @xmath1-decay can be improved from @xmath18 at the lep to @xmath19 at the gigaz@xcite . 
in light of this , the @xmath1-boson properties , especially its exotic or rare decays which are widely believed to be sensitive to new physics , should be investigated comprehensively to evaluate their potential in probing new physics . among the rare @xmath1-decays , the flavor changing ( fc ) processes were most extensively studied to explore the flavor texture in new physics @xcite , and it was found that , although these processes are severely suppressed in the sm , their branching ratios in new physics models can be greatly enhanced to @xmath19 for lepton flavor violation decays @xcite and @xmath20 for quark flavor violation decays @xcite . besides the fc processes , the @xmath1-decay into light higgs boson(s ) is another type of rare process that was widely studied , e.g. the decay @xmath21 ( @xmath22 ) with the particle @xmath0 denoting a light higgs boson was studied in @xcite , the decay @xmath23 was studied in the two higgs doublet model ( 2hdm)@xcite and the minimal supersymmetric standard model ( mssm)@xcite , and the decay @xmath4 was studied in a model independent way @xcite , in 2hdm@xcite and also in mssm@xcite . these studies indicate that , in contrast with the kinematic forbidden of these decays in the sm , the rates of these decays can be as large as @xmath18 in new physics models , which lie within the expected sensitivity of the gigaz . in this work , we extend the previous studies of these decays to some new models and investigate these decays altogether . we are motivated by some recent studies on the singlet extension of the mssm , such as the next - to - minimal supersymmetric standard model ( nmssm ) @xcite and the nearly minimal supersymmetric standard model ( nmssm ) @xcite , where a light cp - odd higgs boson @xmath0 with singlet - dominant component may naturally arise from the spontaneous breaking of some approximate global symmetry like @xmath24 or peccei - quuin symmetry @xcite . these non - minimal supersymmetric models can not only avoid the @xmath25-problem , but also alleviate the little hierarchy by having such a light higgs boson @xmath0 @xcite . we are also motivated by that , with the latest experiments , the properties of the light higgs boson are more stringently constrained than before . so it is worth updating the previous studies . so far there is no model - independent lower bound on the lightest higgs boson mass . in the sm , it must be heavier than @xmath26 gev , obtained from the null observation of the higgs boson at lep experiments . however , due to the more complex structure of the higgs sector in the extensions of the sm , this lower bound can be significantly relaxed according to recent studies , e.g. , for the cp - odd higgs boson @xmath0 we have @xmath27 gev in the nmssm @xcite , @xmath28 gev in the nmssm @xcite , and @xmath29 gev in the lepton - specific 2hdm ( l2hdm ) @xcite . with such a light cp - odd higgs boson , the z - decay into one or more @xmath0 is open up . noting that the decay @xmath30 is forbidden due to bose symmetry , we in this work study the rare @xmath1-decays @xmath6 ( @xmath22 ) , @xmath31 and @xmath4 in a comparative way for four models , namely the type - ii 2hdm@xcite , the l2hdm @xcite , the nmssm and the nmssm . in our study , we examine carefully the constraints on the light @xmath0 from many latest experimental results . this work is organized as follows . in sec . ii we briefly describe the four new physics models . in sec . iii we present the calculations of the rare @xmath1-decays . in sec . 
iv we list the constraints on the four new physics models . in sec . v we show the numerical results for the branching ratios of the rare @xmath1-decays in various models . finally , the conclusion is given in sec . as the most economical way , the sm utilizes one higgs doublet to break the electroweak symmetry . as a result , the sm predicts only one physical higgs boson with its properties totally determined by two free parameters . in new physics models , the higgs sector is usually extended by adding higgs doublets and/or singlets , and consequently , more physical higgs bosons are predicted along with more free parameters involved in . the general 2hdm contains two @xmath32 doublet higgs fields @xmath33 and @xmath34 , and with the assumption of cp - conserving , its scalar potential can be parameterized as@xcite : @xmath35,\end{aligned}\ ] ] where @xmath36 ( @xmath37 ) are free dimensionless parameters , and @xmath38 ( @xmath39 ) are the parameters with mass dimension . after the electroweak symmetry breaking , the spectrum of this higgs sector includes three massless goldstone modes , which become the longitudinal modes of @xmath40 and @xmath1 bosons , and five massive physical states : two cp - even higgs bosons @xmath41 and @xmath42 , one neutral cp - odd higgs particle @xmath0 and a pair of charged higgs bosons @xmath43 . noting the constraint @xmath44 with @xmath45 and @xmath46 denoting the vacuum expectation values ( vev ) of @xmath33 and @xmath34 respectively , we choose @xmath47 as the input parameters with @xmath48 , and @xmath49 being the mixing angle that diagonalizes the mass matrix of the cp - even higgs fields . the difference between the type - ii 2hdm and the l2hdm comes from the yukawa coupling of the higgs bosons to quark / lepton . in the type - ii 2hdm , one higgs doublet @xmath34 generates the masses of up - type quarks and the other doublet @xmath33 generates the masses of down - type quarks and charged leptons ; while in the l2hdm one higgs doublet @xmath33 couples only to leptons and the other doublet @xmath34 couples only to quarks . so the yukawa interactions of @xmath0 to fermions in these two models are given by @xcite @xmath50 with @xmath51 denoting generation index . obviously , in the type - ii 2hdm the @xmath52 coupling and the @xmath53 coupling can be simultaneously enhanced by @xmath54 , while in the l2hdm only the @xmath53 coupling is enhanced by @xmath55 . the structures of the nmssm and the nmssm are described by their superpotentials and corresponding soft - breaking terms , which are given by @xcite @xmath56 where @xmath57 is the superpotential of the mssm without the @xmath25 term , @xmath58 and @xmath59 are higgs doublet and singlet superfields with @xmath60 and @xmath61 being their scalar component respectively , @xmath62 , @xmath63 , @xmath64 , @xmath65 , @xmath66 and @xmath67 are soft breaking parameters , and @xmath68 and @xmath69 are coefficients of the higgs self interactions . with the superpotentials and the soft - breaking terms , one can get the higgs potentials of the nmssm and the nmssm respectively . 
like the 2hdm , the higgs bosons with same cp property will mix and the mass eigenstates are obtained by diagonalizing the corresponding mass matrices : @xmath70 where the fields on the right hands of the equations are component fields of @xmath71 , @xmath72 and @xmath61 defined by @xmath73 @xmath74 and @xmath75 are respectively the cp - even and cp - odd neutral higgs bosons , @xmath76 and @xmath77 are goldstone bosons eaten by @xmath1 and @xmath78 , and @xmath79 is the charged higgs boson . so both the nmssm and nmssm predict three cp - even higgs bosons , two cp - odd higgs bosons and one pair of charged higgs bosons . in general , the lighter cp - odd higgs @xmath0 in these model is the mixture of the singlet field @xmath80 and the doublet field combination , @xmath81 , i.e. @xmath82 and its couplings to down - type quarks are then proportional to @xmath83 . so for singlet dominated @xmath0 , @xmath84 is small and the couplings are suppressed . as a comparison , the interactions of @xmath0 with the squarks are given by@xcite @xmath85 i.e. the interaction does not vanish when @xmath86 approaches zero . just like the 2hdm where we use the vevs of the higgs fields as fundamental parameters , we choose @xmath68 , @xmath69 , @xmath87 , @xmath88 , @xmath66 and @xmath89 as input parameters for the nmssm@xcite and @xmath68 , @xmath54 , @xmath88 , @xmath65 , @xmath90 and @xmath91 as input parameters for the nmssm@xcite . about the nmssm and the nmssm , three points should be noted . the first is for the two models , there is no explicit @xmath92term , and the effective @xmath25 parameter ( @xmath93 ) is generated when the scalar component of @xmath59 develops a vev . the second is , the nmssm is actually same as the nmssm with @xmath94@xcite , because the tadpole terms @xmath95 and its soft breaking term @xmath96 in the nmssm do not induce any interactions , except for the tree - level higgs boson masses and the minimization conditions . and the last is despite of the similarities , the nmssm has its own peculiarity , which comes from its neutralino sector . in the basis @xmath97 , its neutralino mass matrix is given by @xcite @xmath98 where @xmath99 and @xmath100 are @xmath101 and @xmath102 gaugino masses respectively , @xmath103 , @xmath104 , @xmath105 and @xmath106 . after diagonalizing this matrix one can get the mass eigenstate of the lightest neutralino @xmath107 with mass taking the following form @xcite @xmath108 this expression implies that @xmath107 must be lighter than about @xmath109 gev for @xmath110 ( from lower bound on chargnio mass ) and @xmath111 ( perturbativity bound ) . like the other supersymmetric models , @xmath107 as the lightest sparticle acts as the dark matter in the universe , but due to its singlino - dominated nature , it is difficult to annihilate sufficiently to get the correct density in the current universe . so the relic density of @xmath107 plays a crucial way in selecting the model parameters . for example , as shown in @xcite , for @xmath112 , there is no way to get the correct relic density , and for the other cases , @xmath107 mainly annihilates by exchanging @xmath1 boson for @xmath113 , or by exchanging a light cp - odd higgs boson @xmath0 with mass satisfying the relation @xmath114 for @xmath115 . for the annihilation , @xmath54 and @xmath25 are required to be less than 10 and @xmath116 respectively because through eq.([mass - exp ] ) a large @xmath87 or @xmath25 will suppress @xmath117 to make the annihilation more difficult . 
the properties of the lightest cp - odd higgs boson @xmath0 , such as its mass and couplings , are also limited tightly since @xmath0 plays an important role in @xmath107 annihilation . the phenomenology of the nmssm is also rather special , and this was discussed in detail in @xcite . in the type - ii 2hdm , l2hdm , nmssm and nmssm , the rare @xmath1-decays @xmath118 ( @xmath22 ) , @xmath3 and @xmath4 may proceed by the feynman diagrams shown in fig.[fig1 ] , fig.[fig2 ] and fig.[fig3 ] respectively . for these diagrams , the intermediate state @xmath119 represents all possible cp - even higgs bosons in the corresponding model , i.e. @xmath41 and @xmath42 in type - ii 2hdm and l2hdm and @xmath41 , @xmath42 and @xmath120 in nmssm and nmssm . in order to take into account the possible resonance effects of @xmath119 in fig.[fig1](c ) for @xmath2 and fig.[fig3 ] ( a ) for @xmath11 , we have calculated all the decay modes of @xmath119 and properly included the width effect in its propagator . as to the decay @xmath121 , two points should be noted . one is , unlike the decays @xmath6 and @xmath11 , this process proceeds only through loops mediated by quarks / leptons in the type - ii 2hdm and l2hdm , and additionally by sparticles in the nmssm and nmssm . so in most cases its rate should be much smaller than the other two . the other is due to cp - invariance , loops mediated by squarks / sleptons give no contribution to the decay@xcite . in actual calculation , this is reflected by the fact that the coupling coefficient of @xmath122 differs from that of @xmath123 by a minus sign ( see eq.([asqsq ] ) ) , and as a result , the squark - mediated contributions to @xmath121 are completely canceled out . with regard to the rare decay @xmath11 , we have more explanations . in the lowest order , this decay proceeds by the diagram shown in fig.[fig3 ] ( a ) , and hence one may think that , as a rough estimate , it is enough to only consider the contributions from fig.[fig3](a ) . however , we note that in some cases of the type - ii 2hdm and l2hdm , due to the cancelation of the contributions from different @xmath119 in fig.[fig3 ] ( a ) and also due to the potentially largeness of @xmath124 couplings ( i.e. larger than the electroweak scale @xmath125 ) , the radiative correction from the higgs - mediated loops may dominate over the tree level contribution even when the tree level prediction of the rate , @xmath126 , exceeds @xmath20 . on the other hand , we find the contribution from quark / lepton - mediated loops can be safely neglected if @xmath127 in the type - ii 2hdm and the l2hdm . in the nmssm and the nmssm , besides the corrections from the higgs- and quark / lepton - mediated loops , loops involving sparticles such as squarks , charginos and neutralinos can also contribute to the decay . we numerically checked that the contributions from squarks and charginos can be safely neglected if @xmath127 . we also calculated part of potentially large neutralino correction ( note that there are totally about @xmath128 diagrams for such correction ! ) and found they can be neglected too . since considering all the radiative corrections will make our numerical calculation rather slow , we only include the most important correction , namely that from higgs - mediated loops , in presenting our results for the four models . one can intuitively understand the relative smallness of the sparticle contribution to @xmath11 as follows . 
first consider the squark contribution which is induced by the @xmath129 interaction ( @xmath130 denotes the squark in chirality state ) and the @xmath131 interaction through box diagrams . because the @xmath132 interaction conserves the chirality of the squarks while the @xmath133 interaction violates the chirality , to get non - zero contribution to @xmath11 from the squark loops , at least four chiral flippings are needed , with three of them provided by @xmath131 interaction and the rest provided by the left - right squark mixing . this means that , if one calculates the amplitude in the chirality basis with the mass insertion method , the amplitude is suppressed by the mixing factor @xmath134 with @xmath135 being the off diagonal element in squark mass matrix . next consider the chargino / neutralino contributions . since for a light @xmath0 , its doublet component , parameterized by @xmath84 in eq.([mixing ] ) , is usually small , the couplings of @xmath0 with the sparticles will never be tremendously large@xcite . so the chargino / neutralino contributions are not important too . in our calculation of the decays , we work in the mass eigenstates of sparticles instead of in the chirality basis . for the type - ii 2hdm and the l2hdm , we consider the following constraints @xcite : * theoretical constraints on @xmath136 from perturbativity , unitarity and requirements that the scalar potential is finit at large field values and contains no flat directions @xcite , which imply that @xmath137 * the constraints from the lep search for neutral higgs bosons . we compute the signals from the higgs - strahlung production @xmath138 ( @xmath139 ) with @xmath140 @xcite and from the associated production @xmath141 with @xmath142 @xcite , and compare them with the corresponding lep data which have been inputted into our code . we also consider the constraints from @xmath138 by looking for a peak of @xmath143 recoil mass distribution of @xmath1-boson @xcite and the constraint of @xmath144 mev when @xmath145 @xcite . + these constraints limit the quantities such as @xmath146 \times br ( h_i \to \bar{b } b ) $ ] on the @xmath147 plane with the the subscript @xmath148 denoting the coupling coefficient of the @xmath149 interaction . they also impose a model - dependent lower bound on @xmath150 , e.g. , @xmath151 for the type - ii 2hdm ( from our scan results ) , @xmath152 for the l2hdm@xcite , and @xmath153 for the nmssm @xcite . these bounds are significantly lower than that of the sm , i.e. @xmath154 , partially because in new physics models , unconventional decay modes of @xmath155 such as @xmath156 are open up . as to the nmssm , another specific reason for allowing a significantly lighter cp - even higgs boson is that the boson may be singlet - dominated in this model . + with regard to the lightest cp - odd higgs boson @xmath0 , we checked that there is no lower bound on its mass so long as the @xmath157 interaction is weak or @xmath155 is sufficiently heavy . * the constraints from the lep search for a light higgs boson via the yukawa process @xmath158 with @xmath22 and @xmath61 denoting a scalar @xcite . these constraints can limit the @xmath159 coupling versus @xmath160 in new physics models . * the constraints from the cleo - iii limit on @xmath161 and the latest babar limits on @xmath162 . these constraints will put very tight constraints on the @xmath163 coupling for @xmath164 . in our analysis , we use the results of fig.8 in the second paper of @xcite to excluded the unfavored points . 
* the constraints from @xmath165 couplings . since the higgs sector can give sizable higher order corrections to @xmath165 couplings , we calculate them to one loop level and require the corrected @xmath165 couplings to lie within the @xmath166 range of their fitted value . the sm predictions for the couplings at @xmath1-pole are given by @xmath167 and @xmath168 @xcite , and the fitted values are given by @xmath169 and @xmath170 , respectively@xcite . we adopt the formula in @xcite to the 2hdm in our calculation . * the constraints from @xmath171 leptonic decay . we require the new physics correction to the branching ratio @xmath172 to be in the range of @xmath173 @xcite . we use the formula in @xcite in our calculation . + about the constraints ( 5 ) and ( 6 ) , two points should be noted . one is all higgs bosons are involved in the constraints by entering the self energy of @xmath171 lepton , the @xmath174 vertex correction or the @xmath175 vertex correction , and also the box diagrams for @xmath176@xcite . since the yukawa couplings of the higgs bosons to @xmath171 lepton get enhanced by @xmath54 and so do the corrections , @xmath54 must be upper bounded for given spectrum of the higgs sector . generally speaking , the lighter @xmath0 is , the more tightly @xmath54 is limited@xcite . the other point is in the type - ii 2hdm , @xmath177 , b - physics observables as well as @xmath178 decays discussed above can constraint the model in a tighter way than the constraints ( 5 ) and ( 6 ) since the yukawa couplings of @xmath171 lepton and @xmath179 quark are simultaneously enhanced by @xmath54 . but for the l2hdm , because only the yukawa couplings of @xmath171 lepton get enhanced ( see eq.[yukawa ] ) , the constraints ( 5 ) and ( 6 ) are more important in limiting @xmath54 . * indirect constraints from the precision electroweak observables such as @xmath180 , @xmath181 and @xmath182 , or their combinations @xmath183 @xcite . we require @xmath184 to be compatible with the lep / sld data at @xmath185 confidence level@xcite . we also require new physics prediction of @xmath186 is within the @xmath187 range of its experimental value . the latest results for @xmath188 are @xmath189 ( measured value ) and @xmath190 ( sm prediction ) for @xmath191 gev @xcite . in our code , we adopt the formula for these observables presented in @xcite to the type - ii 2hdm and the l2hdm respectively . + in calculating @xmath180 , @xmath181 and @xmath182 , we note that these observables get dominant contributions from the self energies of the gauge bosons @xmath1 , @xmath192 and @xmath193 . since there is no @xmath194 coupling or @xmath195 coupling , @xmath0 must be associated with the other higgs bosons to contribute to the self energies . so by the uv convergence of these quantities , one can infer that , for the case of a light @xmath0 and @xmath196 , these quantities depend on the spectrum of the higgs sector in a way like @xmath197 at leading order , which implies that a light @xmath0 can still survive the constraints from the precision electroweak observables given the splitting between @xmath150 and @xmath198 is moderate@xcite . * the constraints from b physics observables such as the branching ratios for @xmath199 , @xmath200 and @xmath201 , and the mass differences @xmath202 and @xmath203 . we require their theoretical predications to agree with the corresponding experimental values at @xmath187 level . 
+ in the type - ii 2hdm and the l2hdm , only the charged higgs boson contributes to these observables by loops , so one can expect that @xmath198 versus @xmath54 is to be limited . combined analysis of the limits in the type - ii 2hdm has been done by the ckmfitter group , and the lower bound of @xmath204 as a function of @xmath87 was given in fig.11 of @xcite . this analysis indicates that @xmath198 must be heavier than @xmath205 at @xmath185 c.l . regardless the value of @xmath54 . in this work , we use the results of fig.11 in @xcite to exclude the unfavored points . as for the l2hdm , b physics actually can not put any constraints@xcite because in this model the couplings of the charged higgs boson to quarks are proportional to @xmath206 and in the case of large @xmath54 which we are interested in , they are suppressed . in our analysis of the l2hdm , we impose the lep bound on @xmath198 , i.e. @xmath207@xcite . * the constraints from the muon anomalous magnetic moment @xmath208 . now both the theoretical prediction and the experimental measured value of @xmath208 have reached a remarkable precision , but a significant deviation still exists : @xmath209 @xcite . in the 2hdm , @xmath208 gets additional contributions from the one - loop diagrams induced by the higgs bosons and also from the two - loop barr - zee diagrams mediated by @xmath0 and @xmath155@xcite . if the higgs bosons are much heavier than @xmath25 lepton mass , the contributions from the barr - zee diagrams are more important , and to efficiently alleviate the discrepancy of @xmath208 , one needs a light @xmath0 along with its enhanced couplings to @xmath25 lepton and also to heavy fermions such as bottom quark and @xmath171 lepton to push up the effects of the barr - zee diagram@xcite . the cp - even higgs bosons are usually preferred to be heavy since their contributions to @xmath208 are negative . + in the type - ii 2hdm , because @xmath54 is tightly constrained by the process @xmath210 at the lep@xcite and the @xmath178 decay@xcite , the barr - zee diagram contribution is insufficient to enhance @xmath208 to @xmath187 range around its measured value@xcite . so in our analysis , we require the type - ii 2hdm to explain @xmath208 at @xmath211 level . while for the l2hdm , @xmath54 is less constrained compared with the type - ii 2hdm , and the barr - zee diagram involving the @xmath171-loop is capable to push up greatly the theoretical prediction of @xmath208@xcite . therefore , we require the l2hdm to explain the discrepancy at @xmath187 level . + unlike the other constraints discussed above , the @xmath208 constraint will put a two - sided bound on @xmath54 since on the one hand , it needs a large @xmath54 to enhance the barr - zee contribution , but on the other hand , too large @xmath54 will result in an unacceptable large @xmath208 . * since this paper concentrates on a light @xmath0 , the decay @xmath212 is open up with a possible large decay width . we require the width of any higgs boson to be smaller than its mass to avoid a too fat higgs boson@xcite . we checked that for the scenario characterized by @xmath213 , the coefficient of @xmath214 interaction is usually larger than the electroweak scale @xmath125 , and consequently a large decay width is resulted . for the nmssm and nmssm , the above constraints become more complicated because in these models , not only more higgs bosons are involved in , but also sparticles enter the constraints . so it is not easy to understand some of the constraints intuitively . 
take the process @xmath199 as an example . in the supersymmetric models , besides the charged higgs contribution , chargino loops , gluino loops as well as neutralino loops also contribute to the process@xcite , and depending on the susy parameters , any of these contributions may become dominated over or be canceled by other contributions . as a result , although the charged higgs affects the process in the same way as that in the type - ii 2hdm , charged higgs as light as @xmath215 is still allowed even for @xmath216@xcite . since among the constraints , @xmath208 is rather peculiar in that it needs new physics to explain the discrepancy between @xmath217 and @xmath218 , we discuss more about its dependence on susy parameters . in the nmssm and the nmssm , @xmath208 receives contributions from higgs loops and neutralino / chargino loops . for the higgs contribution , it is quite similar to that of the type - ii 2hdm except that more higgs bosons are involved in@xcite . for the neutralino / chargino contribution , in the light bino limit ( i.e. @xmath219 ) , it can be approximated by@xcite @xmath220 for @xmath221 with @xmath222 being smuon mass . so combining the two contributions together , one can learn that a light @xmath0 along with large @xmath54 and/or light smuon with moderate @xmath87 are favored to dilute the discrepancy . because more parameters are involved in the constraints on the supersymmetric models , we consider following additional constraints to further limit their parameters : * direct bounds on sparticle masses from the lep1 , the lep2 and the tevatron experiments @xcite . * the lep1 bound on invisible z decay @xmath223 ; the lep2 bound on neutralino production @xmath224 and @xmath225@xcite . * dark matter constraints from the wmap relic density 0.0975 @xmath226 0.1213 @xcite . note that among the above constraints , the constraint ( 2 ) on higgs sector and the constraint ( c ) on neutralino sector are very important . this is because in the supersymmetric models , the sm - like higgs is upper bounded by about @xmath227 at tree level and by about @xmath228 at loop level , and that the relic density restricts the lsp annihilation cross section in a certain narrow range . in our analysis of the nmssm , we calculate the constraints ( 3 ) and ( 5 - 7 ) by ourselves and utilize the code nmssmtools @xcite to implement the rest constraints . we also extend nmssmtools to the nmssm to implement the constraints . for the extension , the most difficult thing we faced is how to adapt the code micromegas@xcite to the nmssm case . we solve this problem by noting the following facts : * as we mentioned before , the nmssm is actually same as the nmssm with the trilinear singlet term setting to zero . so we can utilize the model file of the nmssm as the input of the micromegas and set @xmath229 . * since in the nmssm , the lsp is too light to annihilate into higgs pairs , there is no need to reconstruct the effective higgs potential to calculate precisely the annihilation channel @xmath230 with @xmath61 denoting any of higgs bosons@xcite . we thank the authors of the nmssmtools for helpful discussion on this issue when we finish such extension@xcite . with the above constraints , we perform four independent random scans over the parameter space of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively . we vary the parameters in following ranges : @xmath231 for the type - ii 2hdm , @xmath232 for the l2hdm , @xmath233 for the nmssm , and @xmath234 for the nmssm . 
in performing the scans , we note that for the nmssm and the nmssm , some constraints also rely on the gaugino masses and the soft breaking parameters in the squark sector and the slepton sector . since these parameters affect little on the properties of @xmath0 , we fix them to reduce the number of free parameters in our scan . for the squark sector , we adopt the @xmath235 scenario which assumes that the soft mass parameters for the third generation squarks are degenerate : @xmath236 800 gev , and that the trilinear couplings of the third generation squarks are also degenerate , @xmath237 with @xmath238 . for the slepton sector , we assume all the soft - breaking masses and trilinear parameters to be 100 gev . this setting is necessary for the nmssm since this model is difficult to explain the muon anomalous moment at @xmath239 level for heavy sleptons@xcite . finally , we assume the grand unification relation @xmath240 for the gaugino masses with @xmath241 being fine structure constants of the different gauge group . with large number of random points in the scans , we finally get about @xmath242 , @xmath243 , @xmath244 and @xmath242 samples for the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively which survive the constraints and satisfy @xmath245 . analyzing the properties of the @xmath0 indicates that for most of the surviving points in the nmssm and the nmssm , its dominant component is the singlet field ( numerically speaking , @xmath246 ) so that its couplings to the sm fermions are suppressed@xcite . our analysis also indicates that the main decay products of @xmath0 are @xmath247 for the l2hdm@xcite , @xmath248 ( dominant ) and @xmath247 ( subdominant ) for the type - ii 2hdm , the nmssm and the nmssm , and in some rare cases , neutralino pairs in the nmssm@xcite . in fig.[fig4 ] , we project the surviving samples on the @xmath249 plane . this figure shows that the allowed range of @xmath54 is from @xmath250 to @xmath251 in the type - ii 2hdm , and from @xmath252 to @xmath253 in the l2hdm . just as we introduced before , the lower bounds of @xmath254 come from the fact that we require the models to explain the muon anomalous moment , while the upper bound is due to we have imposed the constraint from the lep process @xmath255 , which have limited the upper reach of the @xmath256 coupling for light @xmath61 @xcite(for the dependence of @xmath256 coupling on @xmath54 , see sec . this figure also indicates that for the nmssm and the nmssm , @xmath54 is upper bounded by @xmath257 . for the nmssm , this is because large @xmath87 can suppress the dark matter mass to make its annihilation difficult ( see @xcite and also sec . ii ) , but for the nmssm , this is because we choose a light slepton mass so that large @xmath54 can enhance @xmath208 too significantly to be experimentally unacceptable . we checked that for the slepton mass as heavy as @xmath258 , @xmath259 is still allowed for the nmssm . in fig.[fig5 ] and fig.[fig6 ] , we show the branching ratios of @xmath260 and @xmath261 respectively . fig.[fig5 ] indicates , among the four models , the type - ii 2hdm predicts the largest ratio for @xmath260 with its value varying from @xmath262 to @xmath263 . the underlying reason is in the type - ii 2hdm , the @xmath264 coupling is enhanced by @xmath54 ( see fig.[fig4 ] ) , while in the other three model , the coupling is suppressed either by @xmath265 or by the singlet component of the @xmath0 . 
fig.[fig6 ] shows that the l2hdm predicts the largest rate for @xmath266 with its value reaching @xmath5 in optimum case , and for the other three models , the ratio of @xmath261 is at least about one order smaller than that of @xmath267 . this feature can be easily understood from the @xmath268 coupling introduced in sect . we emphasize that , if the nature prefers a light @xmath0 , @xmath260 and/or @xmath269 in the type - ii 2hdm and the l2hdm will be observable at the gigaz . then by the rates of the two decays , one can determine whether the type - ii 2hdm or the l2hdm is the right theory . on the other hand , if both decays are observed with small rates or fail to be observed , the singlet extensions of the mssm are favored . in fig.[fig7 ] , we show the rate of @xmath3 as the function of @xmath270 . this figure indicates that the branching ratio of @xmath121 can reach @xmath271 , @xmath272 , @xmath273 and @xmath274 for the optimal cases of the type - ii 2hdm , the l2hdm , the nmssm and the nmssm respectively , which implies that the decay @xmath121 will never be observable at the gigaz if the studied model is chosen by nature . the reason for the smallness is , as we pointed out before , that the decay @xmath121 proceeds only at loop level . comparing the optimum cases of the type - ii 2hdm , the nmssm and the nmssm shown in fig.5 - 7 , one may find that the relation @xmath275 holds for any of the decays . this is because the decays are all induced by the yukawa couplings with similar structure for the models . in the supersymmetric models , the large singlet component of the light @xmath0 is to suppress the yukawa couplings , and the @xmath0 in the nmssm has more singlet component than that in the nmssm . next we consider the decay @xmath11 , which , unlike the above decays , depends on the higgs self interactions . in fig.[fig8 ] we plot its rate as a function of @xmath270 and this figure indicates that the @xmath276 may be the largest among the ratios of the exotic @xmath1 decays , reaching @xmath277 in the optimum cases of the type - ii 2hdm , the l2hdm and the nmssm . the underlying reason is , in some cases , the intermediate state @xmath119 in fig.[fig3 ] ( a ) may be on - shell . in fact , we find this is one of the main differences between the nmssm and the nmssm , that is , in the nmssm , @xmath119 in fig.[fig3 ] ( a ) may be on - shell ( corresponds to the points with large @xmath278 ) while in the nmssm , this seems impossible . so we conclude that the decay @xmath11 may serve as an alternative channel to test new physics models , especially it may be used to distinguish the nmssm from the nmssm if the supersymmetry is found at the lhc and the @xmath11 is observed at the gigaz with large rate . before we end our discussion , we note that in the nmssm , the higgs boson @xmath0 may be lighter than @xmath279 without conflicting with low energy data from @xmath178 decays and the other observables ( see fig.[fig4]-[fig8 ] ) . in this case , @xmath0 is axion - like as pointed out in @xcite . we checked that , among the rare @xmath1 decays discussed in this paper , the largest branching ratio comes from @xmath280 which can reach @xmath281 . since in this case , the decay product of @xmath0 is highly collinear muon pair , detecting the decay @xmath280 may need some knowledge about detectors , which is beyond our discussion . 
in this paper , we studied the rare @xmath1-decays @xmath2 ( @xmath7 ) , @xmath282 and @xmath4 in the type - ii 2hdm , lepton - specific 2hdm , nmssm and nmssm , which predict a light cp - odd higgs boson @xmath0 . in the parameter space allowed by current experiments , the branching ratio can be as large as @xmath5 for @xmath118 , @xmath8 for @xmath3 and @xmath9 for @xmath4 , which implies that the decays @xmath2 and @xmath283 may be accessible at the gigaz option . since different models predict different size of branching ratios , these decays can be used to distinguish different model through the measurement of these rare decays . this work was supported in part by hastit under grant no . 2009hastit004 , by the national natural science foundation of china ( nnsfc ) under grant nos . 10821504 , 10725526 , 10635030 , 10775039 , 11075045 and by the project of knowledge innovation program ( pkip ) of chinese academy of sciences under grant no . . for some reviews , see , e.g. , m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod . a * 19 * , 159 ( 2004 ) ; j. m. yang , arxiv:1006.2594 . j. i. illana , m. masip , 67 , 035004 ( 2003 ) ; j. cao , z. xiong , j. m. yang , 32 , 245 ( 2004 ) . d. atwood _ et al_. , 66 , 093005 ( 2002 ) . j. kalinowski , and s. pokorski , 219 , 116 ( 1989 ) ; a. djouadi , p. m. zerwas and j. zunft , 259 , 175 ( 1991 ) ; a. djouadi , j. kalinowski , and p. m. zerwas , z. phys . c * 54 * , 255 ( 1992 ) . m. krawczyk , _ et al . _ , 19 , 463 ( 2001 ) ; 8 , 495 ( 1999 ) . j. f. gunion , g. gamberini and s. f. novaes , 38 , 3481 ( 1988 ) ; thomas j. weiler and tzu - chiang yuan , 318 , 337 ( 1989 ) ; a. djouadi , _ et al . _ , 1 , 163 ( 1998)[hep - ph/9701342 ] . d. chang and w. y. keung , phys . lett . * 77 * , 3732 ( 1996 ) . e. keith and e. ma , 57 , 2017 ( 1998 ) ; m. a. perez , g. tavares - velasco and j. j. toscano , int . j. mod.phys . a * 19 * , 159 ( 2004 ) . f. larios , g. tavares - velasco and c. p. yuan , 64 , 055004 ( 2001 ) ; 66 , 075006 ( 2002 ) . a. djouadi , _ et al . _ , 10 , 27 ( 1999 ) [ hep - ph/9903229 ] . for a detailed introduction of the nmssm , see f. franke and h. fraas , int . j. mod . a * 12 * ( 1997 ) 479 ; for a recent review of the nmssm , see for example , u. ellwanger , c. hugonie , and a. m. teixeira , arxiv : 0910.1785 . see , e.g. , j. r. ellis , j. f. gunion , h. e. haber , l. roszkowski and f. zwirner , phys . rev . d * 39 * ( 1989 ) 844 ; m. drees , int . j. mod . phys . a * 4 * ( 1989 ) 3635 ; u. ellwanger , m. rausch de traubenberg and c. a. savoy , phys . b * 315 * ( 1993 ) 331 ; nucl . b * 492 * ( 1997 ) 21 ; d.j . miller , r. nevzorov , p.m. zerwas , 681 , 3 ( 2004 ) . c. panagiotakopoulos , k. tamvakis , 446 , 224 ( 1999 ) ; 469 , 145 ( 1999 ) ; c. panagiotakopoulos , a. pilaftsis , 63 , 055003 ( 2001 ) ; a. dedes , _ et al . _ , 63 , 055009 ( 2001 ) ; a. menon , _ et al . _ , 70 , 035005 ( 2004 ) ; v. barger , _ et al . _ , 630 , 85 ( 2005 ) . c. balazs , _ et al . _ , 0706 , 066 ( 2007 ) . b. a. dobrescu , k. t. matchev , 0009 , 031 ( 2000 ) ; a. arhrib , k. cheung , t. j. hou , k. w. song , hep - ph/0611211 ; 0703 , 073 ( 2007 ) ; x. g. he , j. tandean , and g. valencia , 98 , 081802 ( 2007 ) ; 0806 , 002 ( 2008 ) ; f. domingo _ et al_. , 0901 , 061 ( 2009 ) ; gudrun hiller , 70 , 034018 ( 2004 ) ; r. dermisek , and john f. gunion , 75 , 075019 ( 2007 ) ; 79 , 055014 ( 2009 ) ; 81 , 055001 ( 2010 ) ; r. dermisek , john f. gunion , and b. mcelrath , 76 , 051105 ( 2007 ) ; z. heng , _ et al_. 
, 77 , 095012 ( 2008 ) ; a. belyaev _ et al_. , 81 , 075021 ( 2010 ) ; d. das and u. ellwanger , arxiv:1007.1151 [ hep - ph ] . s. andreas , o. lebedev , s. ramos - sanchez and a. ringwald , arxiv:1005.3978 [ hep - ph ] . j. f. gunion , jhep * 0908 * , 032 ( 2009 ) ; r. dermisek and j. f. gunion , phys . rev . d * 81 * , 075003 ( 2010 ) . r. dermisek and j. f. gunion , phys . lett . * 95 * , 041801 ( 2005 ) ; phys . d * 73 * , 111701 ( 2006 ) . j. cao , h. e. logan , j. m. yang , 79 , 091701 ( 2009 ) . j. cao , p. wan , l. wu , j. m. yang , 80 , 071701 ( 2009 ) . j. f. gunion and h. e. haber , 67 , 075019 ( 2003 ) . r. m. barnett , _ et al . _ , phys . b * 136 * , 191 ( 1984 ) ; r. m. barnett , g. senjanovic and d. wyler , phys . d * 30 * , 1529 ( 1984 ) ; y. grossman , nucl . b * 426 * , 355 ( 1994 ) . h. s. goh , l. j. hall and p. kumar , jhep * 0905 * , 097 ( 2009 ) ; a. g. akeroyd and w. j. stirling , nucl . b * 447 * , 3 ( 1995 ) ; a. g. akeroyd , phys . b * 377 * , 95 ( 1996 ) ; h. e. logan and d. maclennan , phys . rev . d * 79 * , 115022 ( 2009 ) ; m. aoki , _ et al . _ , arxiv:0902.4665 [ hep - ph ] . v. barger , p. langacker , h. s. lee and g. shaughnessy , phys . d * 73 * , 115010 ( 2006 ) . s. hesselbach , _ et . _ , arxiv:0810.0511v2 [ hep - ph ] . de vivie and p. janot [ aleph collaboration ] , pa13 - 027 contribution to the international conference on high energy physics , warsaw , poland , 2531 july 1996 ; j. kurowska , o. grajek and p. zalewski [ delphi collaboration ] , cern - open-99 - 385 . [ aleph collaboration and delphi collaboration and l3 collaboration ] , phys . rept . * 427 * , 257 ( 2006 ) . j. cao and j. m. yang , jhep * 0812 * , 006 ( 2008 ) . m. krawczyk and d. temes , eur . j. c * 44 * , 435 ( 2005 ) . g. altarelli and r. barbieri , 253 , 161 ( 1991 ) ; m. e. peskin , t. takeuchi , 46 , 381 ( 1992 ) . c. amsler , _ et al . _ , ( particle data group ) , 667 , 1 ( 2008 ) . o. deschamps , s. descotes - genon , s. monteil , v. niess , s. tjampens and v. tisserand , arxiv:0907.5135 [ hep - ph ] . s. su and b. thomas , phys . d * 79 * , 095014 ( 2009 ) . g. abbiendi , _ et al . _ , eur . phys . j. c * 32 * , 453 ( 2004 ) . m. davier , _ et al . _ , 66 , 1 ( 2010 ) . k. cheung , _ et al . _ , phys . d * 64 * , 111301 ( 2001 ) . k. cheung and o. c. w. kong , phys . d * 68 * , 053003 ( 2003 ) . t. besmer , c. greub , t.hurth , 609 , 359 ( 2001 ) ; f. borzumati , _ et al . _ , 62 , 075005(2000 ) . j. cao , k. i. hikasa , w. wang , j. m. yang and l. x. yu , phys . d * 82 * , 051701 ( 2010 ) [ arxiv:1006.4811 [ hep - ph ] ] . j. f. gunion , _ et . d * 73 * , 015011 ( 2006 ) . martin and j. d. wells , phys . d * 64 * , 035003 ( 2001 ) . j. abdallah _ et al . _ , eur . j. c * 31 * , 421 ( 2004 ) ; g. abbiendi _ et al . _ , eur . j. c * 35 * , 1 ( 2004 ) . j. dunkley _ et al . _ [ wmap collaboration ] , astrophys . j. suppl . * 180 * , 306 ( 2009 ) [ arxiv:0803.0586 [ astro - ph ] ] . u. ellwanger _ et al . _ , 02 , 066 ( 2005 ) . g. belanger , f. boudjema , a. pukhov and a. semenov , comput . commun . * 174 * , 577 ( 2006 ) ; comput . phys . commun . * 176 * , 367 ( 2007 ) . g. belanger , f. boudjema , c. hugonie , a. pukhov and a. semenov , jcap * 0509 * , 001 ( 2005 ) .""" - - ARTICLE_MAGNET = r"""it is well known that the classical magnetoresistance ( mr ) in metals or semiconductors with a closed free electron fermi surface increases quadratically with increasing magnetic field @xmath2 for @xmath3 and saturates when @xmath4 . 
here @xmath5 is the zero - magnetic - field mobility . hence , the extraordinarily high and linear mr ( lmr ) , which breaks this familiar rule , has been gaining much attention as soon as its discovery . in the past decade , this unexpected lmr has been reported in silver chalcogenide,@xcite indium antimonide,@xcite silicon,@xcite mnas - gaas composite material,@xcite and graphene.@xcite kapitza s linear law@xcite indicates that the metal shows a magnetoresistance linear in perpendicular magnetic field when it has an open fermi surface and a mean free path longer than the electronic larmor radius . recently , another two models , irrespective of the open fermi surface , have been constructed to provide possible mechanisms for the lmr phenomenon . abrikosov suggested a quantum - limit origin of lmr for the homogenous system with a gapless linear energy spectrum.@xcite his model requires that landau levels are well formed and the carrier concentration is small that all electrons occupy only the lowest landau band . alternatively , parish and littlewood developed a classical model without involving linear spectrum.@xcite ignoring the concrete microscopic mechanism , they attributed this unusual mr to the mobility fluctuations in a strongly inhomogenous system . topological insulators@xcite ( tis ) are novel materials with a full energy gap in bulk , while there are gapless surface states . due to its unique band structure with only one helical dirac cone and linear energy dispersion,@xcite the surface states of the ti bi@xmath0se@xmath1 become an excellent platform for the study of quantum - limit lmr . the recent experiment in this flat surface system , however , reported that a large positive mr , which becomes very linear above a characteristic field of @xmath6@xmath7@xmath8 t , was observed even in an opposite situation where the carrier sheet density is high that electrons occupy more than one landau levels.@xcite moreover , they found that raising temperature to room temperature almost has no influence on the observed lmr . it is striking that this observation is in conflict with abrikosov s model and also with the classical parish - littlewood model . so far a reliable theoretical scheme capable of explaining this novel experiment has still been lacking . in this paper , we generalize the balance - equation approach@xcite to a system modeling the surface states of a three - dimensional ti to investigate the two - dimensional magnetotransport in it . we find that a positive , nonsaturating and dominantly linear magnetoresistance can appear within quite wide magnetic - field range in the ti surface state having a positive and finite effective g - factor . this linear magnetoresistance shows up in the system of high carrier concentration and low mobility when electrons are in extended states and spread over many smeared landau levels , and persists up to room temperature , providing a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite we consider the surface state of a bi@xmath0se@xmath1-type large bulk gap ti in the @xmath9-@xmath10 plane under the influence of a uniform magnetic field @xmath11 applied along the @xmath12 direction.@xcite following the experimental observation,@xcite we assume that the fermi energy locates in the gap of the bulk band and above the dirac point , i.e. the surface carriers are electrons . 
further , the separations of the fermi energy from the bottom of bulk band and dirac point are much larger than the highest temperature ( @xmath13 ) considered in this work . hence , the contribution from the bulk band to the magnetotransport is negligible . these electrons , scattered by randomly distributed impurities and by phonons , are driven by a uniform in - plane electric field @xmath14 in the topological surface . the hamiltonian of this many - electron and phonon system consists of an electron part @xmath15 , a phonon part @xmath16 , and electron - impurity and electron - phonon interactions @xmath17 and @xmath18 : @xmath19 here , the electron hamiltonian is taken in the form @xmath20 , \ ] ] in which @xmath21 , @xmath22 , @xmath23 and @xmath24 , stand , respectively , for the canonical momentum , coordinate , momentum and spin operators of the @xmath25th electron having charge @xmath26 , @xmath27 is the vector potential of the perpendicular magnetic field @xmath28 in the landau gauge , @xmath29 is the fermi velocity , @xmath30 is the effective g - factor of the surface electron , and @xmath31 is the bohr magneton with @xmath32 the free electron mass . the sum index @xmath25 in eq.([helectron ] ) goes over all electrons of total number @xmath33 in the surface state of unit area . in the frame work of balance equation approach,@xcite the two - dimensional center - of - mass ( c.m . ) momentum and coordinate @xmath34 and @xmath35 , and the relative - electron momenta and coordinates @xmath36 and @xmath37 are introduced to write the hamiltonian @xmath15 into the sum of a single - particle c.m . part @xmath38 and a many - particle relative - electron part @xmath39 : @xmath40 , with @xmath41.\end{aligned}\ ] ] in this , @xmath42 is the canonical momentum of the center - of - mass and @xmath43 is the canonical momentum for the @xmath25th relative electron . here we have also introduced c.m . spin operators @xmath44 and @xmath45 . the commutation relations between the c.m . spin operators @xmath46 and @xmath47 and the spin operators @xmath48 , @xmath49 and @xmath50 of the @xmath25th electron are of order of @xmath51 : @xmath52= n^{-1}2\,{\rm i}\,\varepsi lon_{\beta_1\beta_2\beta_3}\sigma_j^{\beta_3}$ ] with @xmath53 . therefore , for a macroscopic large @xmath33 system , the c.m . part @xmath38 actually commutes with the relative - electron part @xmath54 in the hamiltonian , i.e. the c.m . motion and the relative motion of electrons are truly separated from each other . the couplings between the two emerge only through the electron impurity and electron phonon interactions . furthermore , the electric field @xmath55 shows up only in @xmath38 . and , in view of @xmath56={\rm i}\delta_{\alpha \beta}(\delta_{ij}-1/n)\simeq { \rm i}\delta_{\alpha\beta}\delta_{ij}$ ] , i.e. the relative - electron momenta and coordinates can be treated as canonical conjugate variables , the relative - motion part @xmath54 is just the hamiltonian of @xmath33 electrons in the surface state of ti in the magnetic field without the presence of the electric field . in terms of the c.m . 
coordinate @xmath57 and the relative electron density operator @xmath58 , the electron impurity and electron phonon interactions can be written as@xcite @xmath59 here @xmath60 and @xmath61 are respectively the impurity potential ( an impurity at randomly distributed position @xmath62 ) and electron phonon coupling matrix element in the plane - wave representation , and @xmath63 with @xmath64 and @xmath65 being the creation and annihilation operators for a phonon of wavevector @xmath66 in branch @xmath67 having frequency @xmath68 . velocity ( operator ) @xmath69 is the time variation of its coordinate : @xmath70= v_{\rm f}(\sigma_{\rm c}^y\ , \hat{i}-\sigma_{\rm c}^x\ , \hat{j})$ ] . to derive a force - balance equation for steady state transport we consider the heisenberg equation for the rate of change of the c.m . canonical momentum @xmath71 : @xmath72= - n e({\bm v}\times { \bm b})- n e{\bm e}+{\bm { f}}_{\rm i}+{\bm { f}}_{\rm p},\ ] ] in which the frictional forces @xmath73 and @xmath74 share the same expressions as given in ref .. the statistical average of the operator equation can be determined to linear order in the electron impurity and electron phonon interactions @xmath17 and @xmath18 with the initial density matrix @xmath75 at temperature @xmath76 when the in - plane electric field @xmath77 is not strong . for steady - transport states we have @xmath78 , leading to a force - balance equation of the form @xmath79 here @xmath80 , the statistically averaged velocity of the moving center - of - mass , is identified as the average rate of change of its position , i.e. the drift velocity of the electron system driven by the electric field @xmath77 , and @xmath81 and @xmath82 are frictional forces experienced by the center - of - mass due to impurity and phonon scatterings : @xmath83,\label{fp}\end{aligned}\ ] ] in which @xmath84 is the bose distribution function , @xmath85 , and @xmath86 stands for the imaginary part of the fourier spectrum of the relative - electron density correlation function defined by @xmath87\big\rangle_{0},\ ] ] where @xmath88 and @xmath89 denotes the statistical averaging over the initial density matrix @xmath90.@xcite the force - balance equation describes the steady - state two - dimensional magnetotransport in the surface state of a ti . note that the frictional forces @xmath81 and @xmath82 are in the opposite direction of the drift velocity @xmath91 and their magnitudes are functions of @xmath92 only . with the drift velocity @xmath93 in the @xmath9 direction , the force - balance equation eq . yields a transverse resistivity @xmath94 , and a longitudinal resistivity @xmath95 . the linear one is in the form @xmath96 for calculating the electron density correlation function @xmath97 we proceed in the landau representation.@xcite the landau levels of the single - particle hamiltonian @xmath98 of the relative - electron system in the absence of electric field are composed of a positive `` @xmath99 '' and a negative `` @xmath100 '' branch@xcite @xmath101 with @xmath102 and @xmath103 , and a zero ( @xmath104 ) level @xmath105 the corresponding landau wave functions are @xmath106 and @xmath107 for @xmath108 ; and @xmath109 for @xmath104 . here @xmath110 is the wavevector of the system along @xmath9 direction ; @xmath111 with @xmath112 ; and @xmath113 is the harmonic oscillator eigenfunction with @xmath114 being the hermite polynomial , @xmath115 , and @xmath116 . each landau level contains @xmath117 electron states for system of unit surface area . 
the positive branch @xmath118 and the @xmath104 level @xmath119 of the above energy spectra are indeed quite close to those of the surface states in the bulk gap of bi@xmath0se@xmath1-family materials derived from microscopic band calculation.@xcite the landau levels are broadened due to impurity , phonon and electron - electron scatterings . we model the imaginary part of the retarded green s function , or the density - of - states , of the broadened landau level @xmath120 ( written for `` + ' ' -branch and @xmath104 levels ) , using a gaussian - type form:@xcite @xmath121,\ ] ] with a half - width @xmath122 of the form:@xcite @xmath123^{1/2}$ ] . here @xmath124 is the single - particle lifetime and @xmath125 is the cyclotron frequency of linear - energy - dispersion system with @xmath126 being the zero - temperature fermi level . using a semi - empirical parameter @xmath127 to relate @xmath124 with the transport scattering time @xmath128 , and expressing @xmath129 with the zero - field mobility @xmath5 at finite temperature,@xcite we can write the landau - level broadening as @xmath130^{1/2}.\ ] ] in the present study we consider the case of @xmath120-doping , i.e. the fermi level is high enough above the energy zero of the dirac cone in the range of `` + ' ' -branch levels and the states of `` @xmath100''-branch levels are completely filled , that they are irrelevant to electron transport . special attention has to be paid to the @xmath104 level , since , depending on the direction of exchange potential the effective g - factor of a ti surface state , @xmath30 , can be positive , zero or negative.@xcite the sign and magnitude of the effective g - factor determines how many states of the zero level should be included in or excluded from the available states for electron occupation in the case of @xmath120-doping at a magnetic field . ( i ) if @xmath131 , the @xmath104 level center is exactly at @xmath132 and the system is electron - hole symmetric . the total number of negative energy states ( including the states of the lower half of the @xmath104 level and states of the @xmath100"-branch levels ) and that of positive energy states ( including the states of the upper half of the @xmath104 level and states of the @xmath99"-branch levels ) do not change when changing magnetic field . therefore , the lower - half negative energy states of this level are always filled and the upper - half positive - energy states of it are available for the occupation of particles which are counted as electrons participating in transport in the case of @xmath120-doping . ( ii ) for a finite positive @xmath133 , the @xmath104 level @xmath134 moves downward to negative energy and its distance to the nearest @xmath100"-branch level is @xmath135 closer than to the nearest + " -branch level at finite magnetic field strength @xmath2 . this is equivalent to the opening of an increasingly enlarged ( with increasing @xmath2 ) energy gap between the + " -branch states and the states of the zero - level and the @xmath100"-branch levels . the opening of a sufficient energy gap implies that with increasing magnetic field the states in the + " -branch levels would no longer shrink into the zero - level , and thus the @xmath104 level should be completely excluded from the conduction band , i.e. 
only particles occupying the + " -branch states are counted as electrons participating in transport in the case of @xmath120-doping , when the magnetic field @xmath2 gets larger than a certain value ( depending on the magnitude of @xmath30 ) . ( iii ) for a finite negative @xmath136 , the @xmath104 level @xmath134 moves upward to positive energy and an increasingly enlarged energy gap will be opened between the states of the zero - level and the + " -branch and the states of @xmath100"-branch levels , and particles occupying the @xmath104 level and + " -branch states are electrons participating in transport when the magnetic field @xmath2 gets larger than a certain value . as a result , the experimentally accessible sheet density @xmath33 of electrons participating in transport is related to the fermi energy @xmath137 by the following equation valid at finite @xmath30 for the magnetic field @xmath2 larger than a certain value : @xmath138 in which @xmath139 + 1\}^{-1}$ ] is the fermi distribution function at temperature @xmath76 and the summation index @xmath120 goes over @xmath140 for @xmath133 , or @xmath141 for @xmath136 . in the case of @xmath131 , @xmath142\ ] ] valid for arbitrary magnetic field , in which @xmath143 . the imaginary part of relative - electron density correlation function in the presence of a magnetic field , @xmath86 , can be expressed in the landau representation as@xcite @xmath144 in which the transform factor @xmath145 ^ 2,\end{aligned}\ ] ] with @xmath146 , @xmath147 , @xmath148 , and @xmath149 being associated laguerre polynomials . the landau - representation correlation function @xmath150 in eq.([piqw ] ) can be constructed with the imaginary part of the retarded green s function @xmath151 , or the density - of - states , of the @xmath120th landau level as@xcite @xmath152\nonumber\\ & \hspace{1.2cm}\times{\rm im}g_n(\epsilon+\omega){\rm im}g_{n'}(\epsilon).\end{aligned}\ ] ] the summation indices @xmath120 and @xmath153 in eq.([piqw ] ) are taken over @xmath140 for @xmath133 , or @xmath154 for @xmath136 . in the case of @xmath131 , eq.([piqw ] ) still works and the summation indices @xmath120 and @xmath153 go over @xmath154 but with @xmath155 replaced by @xmath156 in eq.([p2nn ] ) . numerical calculations are performed for the magnetoresistivity @xmath157 of surface state in a uniform ti bi@xmath0se@xmath1 . at zero temperature the elastic scattering contributing to the resistivity is modeled by a coulomb potential due to charged impurities:@xcite @xmath158 with @xmath159 being the impurity density , which is determined by the zero - magnetic - field mobility @xmath5 . at temperatures higher than @xmath160,@xcite phonon scatterings play increasingly important role and the dominant inelastic contribution comes from optical phonons . for this polar material , the scattering by optical phonons via the deformation potential can be neglected . hence , we take account of inelastic scattering from optical phonons via frhlich coupling : @xmath161 . in the numerical calculation we use the following parameters:@xcite fermi velocity @xmath162 , static dielectric constant @xmath163 , optical dielectric constant @xmath164 , and phonon energy @xmath165 . the broadening parameter is taken to be @xmath166 . as a function of the magnetic field @xmath2 having different effective g - factors : @xmath167 and @xmath168 for a ti surface system with electron sheet density @xmath169 in the cases of zero - magnetic - field mobility @xmath170 ( a ) and @xmath171 ( b ) . 
several integer - number positions of filling factor @xmath172 are marked in ( b).,scaledwidth=40.0% ] fig.[diffg ] shows the calculated magnetoresistivity @xmath157 versus the magnetic field strength @xmath2 for a ti surface system with electron sheet density @xmath169 but having different effective g - factors : @xmath167 and @xmath168 for two values of zero - magnetic - field mobility @xmath170 and @xmath171 , representing different degree of landau - level broadening . in the case without zeeman splitting ( @xmath131 ) the resistivity @xmath157 exhibits almost no change with changing magnetic field up to 10 t , except the shubnikov - de haas ( sdh ) oscillation showing up in the case of @xmath171 . this kind of magnetoresistance behavior was indeed seen experimentally in the electron - hole symmetrical massless system of single - layer graphene.@xcite in the case of a positive g - factor , @xmath173 , the magnetoresistivity increases linearly with increasing magnetic field ; while for a negative g - factor , @xmath174 , the magnetoresistivity decreases linearly with increasing magnetic field . is shown as a function of the magnetic field @xmath2 for different values of zero - magnetic - field mobility : ( a ) @xmath175 , ( b ) @xmath176 , ( c ) @xmath177 , ( d ) @xmath178 , ( e ) @xmath179 , and ( f ) @xmath180 . the inset of ( a ) illustrates the same for a larger magnetic - field range @xmath181 . the filling factor @xmath182 is plotted versus the magnetic field in ( f ) ; and several integer - number positions of @xmath182 are also marked in ( d ) and ( e ) . here the surface electron density @xmath169 and the lattice temperature @xmath183.,scaledwidth=47.0% ] in the following we will give more detailed examination on the linearly increasing magnetoresistance in the positive @xmath30 case . fig.[rhob ] shows the calculated resistivity @xmath157 versus the magnetic field strength @xmath2 at lattice temperature @xmath183 for system of carrier sheet density @xmath169 and @xmath173 , having different zero - field mobility @xmath184 and @xmath180 . all resistivity curves for mobility @xmath185 exhibit clear linearity in the magnetic - field range and appear no tendency of saturation at the highest field shown in the figure . especially , for the case @xmath170 , the linear behavior extends even up to the magnetic field of @xmath186 , as illustrated in the inset of fig.[rhob](a ) . this feature contradicts the classical mr which saturates at sufficiently large magnetic field @xmath187 . note that here we only present the calculated @xmath157 for magnetic field @xmath2 larger than @xmath188 t , for which a sufficient energy gap @xmath135 is assumed to open that with further increase of the magnetic field the states in the `` + ' ' -branch levels no longer shrink into the zero level and thus it should be excluded from the conduction band . this is of course not true for very weak magnetic field . when @xmath189 the energy gap @xmath190 , the situation becomes similar to the case of @xmath131 : the whole upper half of the zero - level states are available to electron occupation and we should have a flat resistivity @xmath157 when changing magnetic field . with increasing @xmath2 the portion of the zero - level states available to conduction electrons decreases until the magnetic field reaches @xmath191 . as a result the resistivity @xmath157 should exhibit a crossover from a flat changing at small @xmath2 to positively linear increasing at @xmath192 . 
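A rough order-of-magnitude sketch of where such a flat-to-linear crossover could sit: the gap that opens with field is the Zeeman splitting g*mu_B*B, and the crossover field is estimated here simply as the field at which this splitting exceeds the larger of the thermal energy and an assumed level half-width. The criterion itself, the g-factor, the temperature and the half-width below are illustrative assumptions, since the paper's actual threshold field and broadening formula appear only as @xmath placeholders.

mu_B = 9.2740100783e-24   # Bohr magneton, J/T
k_B = 1.380649e-23        # Boltzmann constant, J/K
e = 1.602176634e-19       # elementary charge, C (used for the meV conversion)

def crossover_field_estimate(g=20.0, T=4.2, gamma_meV=1.0):
    # Field at which the assumed Zeeman gap g*mu_B*B exceeds the larger of
    # k_B*T and an assumed Landau-level half-width gamma; purely illustrative.
    energy_scale = max(k_B * T, gamma_meV * 1e-3 * e)   # J
    return energy_scale / (g * mu_B)                    # tesla

print(crossover_field_estimate())          # roughly 0.9 T for these stand-in values
print(crossover_field_estimate(g=60.0))    # a larger g-factor pushes the crossover near 0.3 T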
this is just the behavior observed in the ti bi@xmath0se@xmath1.@xcite note that in the case of @xmath170 , the broadened landau - level widths are always larger than the neighboring level interval : @xmath193 , which requires @xmath194 ^ 2 $ ] , even for the lowest landau level @xmath195 , i.e. the whole landau - level spectrum is smeared . with increasing the zero - field mobility the magnitude of resistivity @xmath157 decreases , and when the broadened landau - level width becomes smaller than the neighboring level interval , @xmath196 , a weak sdh oscillation begin to occur around the linearly - dependent average value of @xmath157 at higher portion of the magnetic field range , as seen in fig.[rhob](c ) , ( d ) and ( e ) for @xmath197 and @xmath198 . on the other hand , in the case of large mobility , e.g. @xmath199 , where the broadened landau - level widths @xmath200 are much smaller than the neighboring level interval even for level index @xmath120 as large as @xmath201 , the magnetoresistivity shows pronounced sdh oscillation and the linear - dependent behavior disappears , before the appearance of quantum hall effect,@xcite as shown in fig.[rhob](f ) . abrikosov s model for the lmr requires the applied magnetic field large enough to reach the quantum limit at which all the carriers are within the lowest landau level,@xcite while it is obvious that more than one landau levels are occupied in the experimental samples in the field range in which the linear and non - saturating magnetoresistivity was observed.@xcite for the given electron surface density @xmath202 , the number of occupied landau levels , or the filling factor @xmath172 , at different magnetic fields is shown in fig.[rhob](f ) , as well as in the fig.[rhob](d ) and ( e ) , where the integer - number positions of @xmath203 , i.e. filling up to entire @xmath182 landau levels , coincide with the minima of the density - of - states or the dips of sdh oscillation . this is in contrast with @xmath131 case , where the integer number of @xmath203 , which implies a filling up to the center position of the @xmath182th landau levels , locates at a peak of sdh oscillation , as shown in fig.[diffg]b . the observed sdh oscillations in the bi@xmath0se@xmath1 nanoribbon exhibiting nonsaturating surface lmr in the experiment@xcite favor the former case : a finite positive effective @xmath133 . is plotted as a function of the surface electron density @xmath33 at magnetic field @xmath204 : ( a ) at different values of zero - field mobility @xmath5 , and ( b ) at different values of zero - field conductivity @xmath205.,scaledwidth=40.0% ] at various lattice temperatures . here the zero - magnetic - field mobility at zero temperature is @xmath206.,scaledwidth=35.0% ] next , we examine the density - dependence of the linear magnetoresistivity . to compare with abrikosov s quantum magnetoresistance which suggests a @xmath207 behavior,@xcite we show the calculated @xmath208 for above lmr versus the carrier sheet density @xmath33 in fig.[rhon ] at fixed magnetic field @xmath209 t . the mobility is taken respectively to be @xmath210 and @xmath211m@xmath212/vs to make the resistivity in the lmr regime . a clearly linear dependence of @xmath213 on the surface density @xmath33 is seen in all cases , indicating that this non - saturating linear resistivity is almost inversely proportional to the carrier density . in the figure we also show @xmath208 versus @xmath33 under the condition of different given conductivity @xmath214 and @xmath215 . 
in this case the half - width @xmath216 is independent of surface density . the linear dependence still holds , indicating that this linear behavior is not sensitive to the modest @xmath33-dependence of landau level broadening @xmath216 as long as the system is in the overlapped landau level regime . from the above discussion , it is obvious that lmr shows up in the system having overlapped landau levels and the separation of landau levels makes the mr departure from the linear increase . at high temperature , the thermal energy would smear the level separation and phonon scatterings further broaden landau levels . hence , it is believed that this lmr will be robust against raising temperature . this is indeed the case as seen in fig.[rhot ] , where we plot the calculated magnetoresistivity @xmath157 for the above system with zero - temperature linear mobility @xmath217m@xmath212/vs versus the magnetic field at different lattice temperatures . we can see that raising temperature to room temperature has little effect on the linearity of mr . due to the decreased mobility at higher temperature from phonon scattering , the weak sdh oscillation on the linear background tends to vanish . these features are in good agreement with the experimental report.@xcite in summary , we have studied the two - dimensional magnetotransport in the flat surface of a three - dimensional ti , which arises from the surface states with a wavevector - linear energy dispersion and a finite , positive zeeman splitting within the bulk energy gap . when the level broadening is comparable to or larger than the landau - level separation and the conduction electrons spread over many landau levels , a positive , dominantly linear and non - saturating magnetoresistance appears within a quite wide range of magnetic field and persists up to room temperature . this remarkable lmr provides a possible mechanism for the recently observed linear magnetoresistance in topological insulator bi@xmath0se@xmath1 nanoribbons.@xcite in contrast to quantum hall effect which appears in the case of well formed landau levels and to abrikosov s quantum magnetotransport,@xcite which is limited to the extreme quantum limit that all electrons coalesce into the lowest landau level , the discussed lmr is a phenomena of pure classical two - dimensional magnetotransport in a system having linear - energy - dispersion , appearing in the regime of overlapped landau levels , irrespective of its showing up in relatively high magnetic field range . furthermore , the present scheme deals with spatially uniform case without invoking the mobility fluctuation in a strongly inhomogeneous system , which is required in the classical parish and littlewood model to produce a lmr.@xcite the appearance of this significant positive - increasing linear magnetoresistance depends on the existence of a positive and sizable effective g - factor . if the zeeman energy splitting is quite small the resistivity @xmath157 would exhibit little change with changing magnetic field . in the case of a negative and sizable effective g - factor the magnetoresistivity would decrease linearly with increasing magnetic field . therefore , the behavior of the longitudinal resistivity versus magnetic field may provide a useful way for judging the direction and the size of the effective zeeman energy splitting in ti surface states . this work was supported by the national science foundation of china ( grant no . 11104002 ) , the national basic research program of china ( grant no . 
2012cb927403 ) and by the program for science&technology innovation talents in universities of henan province ( grant no . 2012hastit029 ) .""" - - dct = tok.batch_encode_plus( - [ARTICLE_LEP, ARTICLE_MAGNET], - max_length=6144, - padding="max_length", - truncation=True, - return_tensors="pt", - ) - - hypotheses_batch = hf.generate( - input_ids=dct["input_ids"], - attention_mask=dct["attention_mask"], - num_beams=4, - max_length=512, - early_stopping=True, - no_repeat_ngram_size=3, - ) - - EXPECTED_LEP = ( - " the physics of @xmath0-boson will again play the central role in the frontier of particle physics if the" - " gigaz option of the international linear collider ( ilc ) can be realized in its first phase. \n the" - " expected sensitivity to the branching ratio of rare decays, especially its exotic or rare processes," - " should be investigated comprehensively to evaluate their potential in probing new physics. in this work" - " \n, we study the rare decay into light higgs boson(s ) in the framework of the minimal supersymmetric" - " standard model ( mssm ), where a light cp - odd higgs - boson with singlet - dominant component may" - " naturally arise from the spontaneous breaking of some approximate global symmetry. " - ) - - EXPECTED_MAGNET = ( - " the recent experiment in the surface states of the topological insulator bi@xmath0se @xmath1, however," - " reported that a large positive magnetoresistance becomes very linear in perpendicular magnetic field" - " even in an opposite situation where the carrier sheet density is high that all electrons occupy more" - " than one landau levels. \n it is striking that this observation is in conflict with abrikosov s model" - " and also with the classical parish - littlewood model. " - ) - - generated = tok.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - assert generated == [EXPECTED_LEP, EXPECTED_MAGNET] \ No newline at end of file diff --git a/tests/transformers/models/lilt/__init__.py b/tests/transformers/models/lilt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/lilt/test_modeling_lilt.py b/tests/transformers/models/lilt/test_modeling_lilt.py deleted file mode 100644 index 521283939..000000000 --- a/tests/transformers/models/lilt/test_modeling_lilt.py +++ /dev/null @@ -1,345 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" Testing suite for the MindSpore LiLT model. 
""" - -import unittest -import numpy as np - -from mindnlp.transformers import LiltConfig -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - slow, - require_mindspore, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.transformers import ( - LiltForQuestionAnswering, - LiltForSequenceClassification, - LiltForTokenClassification, - LiltModel, - ) - - -class LiltModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=24, - num_hidden_layers=2, - num_attention_heads=6, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - range_bbox=1000, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - self.range_bbox = range_bbox - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox) - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = bbox[i, j, 2] - bbox[i, j, 2] = bbox[i, j, 0] - bbox[i, j, 0] = t - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - sequence_labels = None - token_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - - config = self.get_config() - - return ( - config, - input_ids, - bbox, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) - - def get_config(self): - return LiltConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - 
max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def create_and_check_model( - self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - model = LiltModel(config=config) - model.set_train(False) - result = model( - input_ids, - bbox=bbox, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) - result = model(input_ids, bbox=bbox, token_type_ids=token_type_ids) - result = model(input_ids, bbox=bbox) - - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - self.parent.assertEqual( - result.pooler_output.shape, (self.batch_size, self.hidden_size) - ) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - config.num_labels = self.num_labels - model = LiltForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - bbox=bbox, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.num_labels) - ) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - bbox, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - model = LiltForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - bbox=bbox, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "bbox": bbox, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class LiltModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - LiltForSequenceClassification, - LiltForTokenClassification, - LiltForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - - fx_compatible = False - test_pruning = False - - def setUp(self): - self.model_tester = LiltModelTester(self) - self.config_tester = ConfigTester(self, config_class=LiltConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "SCUT-DLVCLab/lilt-roberta-en-base" - model = LiltModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -@slow -class LiltModelIntegrationTest(unittest.TestCase): - def test_inference_no_head(self): - model = LiltModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base") - - input_ids = mindspore.tensor([[1, 2]]) - bbox = mindspore.tensor( - [[[1, 2, 3, 4], [5, 6, 7, 8]]], - ) - - outputs = model(input_ids=input_ids, bbox=bbox) - - expected_shape = (1, 2, 768) - expected_slice = mindspore.tensor( - [[-0.0653, 0.0950, -0.0061], [-0.0545, 0.0926, -0.0324]], - ) - - self.assertTrue(outputs.last_hidden_state.shape, expected_shape) - self.assertTrue( - np.allclose( - outputs.last_hidden_state[0, :, :3].asnumpy(), - expected_slice.asnumpy(), - atol=1e-3, - ) - ) diff --git a/tests/transformers/models/llama/__init__.py b/tests/transformers/models/llama/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/llama/test_modeling_llama.py b/tests/transformers/models/llama/test_modeling_llama.py deleted file mode 100644 index 3f6e1ab74..000000000 --- a/tests/transformers/models/llama/test_modeling_llama.py +++ /dev/null @@ -1,990 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore LLaMA model.""" - -import gc -import unittest - -from parameterized import parameterized - -from mindnlp.transformers import AutoTokenizer, LlamaConfig, StaticCache, is_mindspore_available -from mindnlp.engine import set_seed -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad - - from mindnlp.transformers import ( - LlamaForCausalLM, - LlamaForQuestionAnswering, - LlamaForSequenceClassification, - LlamaForTokenClassification, - LlamaModel, - LlamaTokenizer, - ) - from mindnlp.transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding, LlamaRotaryEmbedding - - -class LlamaModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return LlamaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - 
hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LlamaModel(config=config) - model - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = LlamaModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = LlamaForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = LlamaForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, 
random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - LlamaModel, - LlamaForCausalLM, - LlamaForSequenceClassification, - LlamaForQuestionAnswering, - LlamaForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (LlamaForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": LlamaModel, - "text-classification": LlamaForSequenceClassification, - "text-generation": LlamaForCausalLM, - "zero-shot": LlamaForSequenceClassification, - "question-answering": LlamaForQuestionAnswering, - "token-classification": LlamaForTokenClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - fx_compatible = True - - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` - # This is because we are hitting edge cases with the causal_mask buffer - model_split_percents = [0.5, 0.7, 0.8] - - # used in `test_torch_compile` - _torch_compile_test_ckpt = "meta-llama/Llama-2-7b-hf" - - def setUp(self): - self.model_tester = LlamaModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlamaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_llama_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = LlamaForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_llama_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = LlamaForSequenceClassification(config) - model.eval() - result = model(input_ids, 
attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_llama_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = LlamaForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_llama_token_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = LlamaForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=token_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), - ) - - @unittest.skip(reason="Llama buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = LlamaModel(config) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = LlamaModel(config) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - self.assertTrue(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - else: - self.assertFalse(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(ops.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = ops.randn(1, dtype=mindspore.float32) # used exlusively to get the dtype and the device - position_ids_short = ops.arange(short_input_length, dtype=mindspore.int64) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = ops.arange(long_input_length, dtype=mindspore.int64) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = LlamaRotaryEmbedding(config=config) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - assert ops.allclose(original_cos_short, original_cos_long[:, :short_input_length, :]) - assert ops.allclose(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = LlamaRotaryEmbedding(config=config) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - assert ops.allclose(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - assert ops.allclose(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - assert ops.allclose(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :], 1e-3, 1e-3) - assert ops.allclose(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :], 1e-3, 1e-3) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = LlamaRotaryEmbedding(config=config) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - assert ops.allclose(ntk_cos_short, original_cos_short, 1e-3, 1e-3) - assert ops.allclose(ntk_sin_short, original_sin_short, 1e-3, 1e-3) - with self.assertRaises(AssertionError): - assert ops.allclose(ntk_cos_long, original_cos_long, 1e-3, 1e-3) - with self.assertRaises(AssertionError): - assert ops.allclose(ntk_sin_long, original_sin_long, 1e-3, 1e-3) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # Sanity check Yarn RoPE scaling - # Scaling should be over the entire input - config.rope_scaling = {"type": "yarn", "factor": scaling_factor} - yarn_scaling_rope = LlamaRotaryEmbedding(config=config) - yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) - yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) - assert ops.allclose(yarn_cos_short, yarn_cos_long[:, :short_input_length, :], 1e-3, 1e-3) - assert ops.allclose(yarn_sin_short, yarn_sin_long[:, :short_input_length, :], 1e-3, 1e-3) - with self.assertRaises(AssertionError): - assert ops.allclose(yarn_cos_short, original_cos_short, 1e-3, 1e-3) - with self.assertRaises(AssertionError): - assert ops.allclose(yarn_sin_short, original_sin_short, 1e-3, 1e-3) - with self.assertRaises(AssertionError): - assert ops.allclose(yarn_cos_long, original_cos_long, 1e-3, 1e-3) - with self.assertRaises(AssertionError): - assert ops.allclose(yarn_sin_long, original_sin_long, 1e-3, 1e-3) - - def test_rope_class_retrocompatibility(self): - # Delete me when we remove compatibility for the old API :) - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - config.rope_scaling = {"type": "linear", "factor": 10} - - # Inputs - x = ops.randn(1, dtype=mindspore.float32) # used exlusively to get the dtype and the device - position_ids_short = ops.arange(short_input_length, dtype=mindspore.int64) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = ops.arange(long_input_length, dtype=mindspore.int64) - position_ids_long = position_ids_long.unsqueeze(0) - - # Old API -- under the hood, "type": "linear" is set and `LlamaRotaryEmbedding` is called - old_api_rope = LlamaLinearScalingRotaryEmbedding( - config.hidden_size // config.num_attention_heads, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ) - old_cos_short, old_sin_short = old_api_rope(x, position_ids_short) - old_cos_long, old_sin_long = old_api_rope(x, position_ids_long) - - # New API - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - new_api_rope = LlamaRotaryEmbedding(config=config) - new_cos_short, new_sin_short = new_api_rope(x, position_ids_short) - new_cos_long, new_sin_long = new_api_rope(x, position_ids_long) - - # The results should match - assert ops.allclose(old_cos_short, new_cos_short) - assert ops.allclose(old_sin_short, new_sin_short) - assert ops.allclose(old_cos_long, new_cos_long) - assert ops.allclose(old_sin_long, new_sin_long) - - def test_model_loading_old_rope_configs(self): - def 
_reinitialize_config(base_config, new_kwargs): - # Reinitialize the config with the new kwargs, forcing the config to go through its __init__ validation - # steps. - base_config_dict = base_config.to_dict() - new_config = LlamaConfig.from_dict(config_dict={**base_config_dict, **new_kwargs}) - return new_config - - # from untouched config -> ✅ - base_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common() - original_model = LlamaForCausalLM(base_config) - original_model(**model_inputs) - - # from a config with the expected rope configuration -> ✅ - config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0}}) - original_model = LlamaForCausalLM(config) - original_model(**model_inputs) - - # from a config with the old rope configuration ('type' instead of 'rope_type') -> ✅ we gracefully handle BC - config = _reinitialize_config(base_config, {"rope_scaling": {"type": "linear", "factor": 10.0}}) - original_model = LlamaForCausalLM(config) - original_model(**model_inputs) - - # from a config with both 'type' and 'rope_type' -> ✅ they can coexist (and both are present in the config) - config = _reinitialize_config( - base_config, {"rope_scaling": {"type": "linear", "rope_type": "linear", "factor": 10.0}} - ) - self.assertTrue(config.rope_scaling["type"] == "linear") - self.assertTrue(config.rope_scaling["rope_type"] == "linear") - original_model = LlamaForCausalLM(config) - original_model(**model_inputs) - - # from a config with parameters in a bad range ('factor' should be >= 1.0) -> ⚠️ throws a warning - with self.assertLogs("mindnlp.transformers.modeling_rope_utils", level="WARNING") as logs: - config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": -999.0}}) - original_model = LlamaForCausalLM(config) - original_model(**model_inputs) - self.assertEqual(len(logs.output), 1) - self.assertIn("factor field", logs.output[0]) - - # from a config with unknown parameters ('foo' isn't a rope option) -> ⚠️ throws a warning - with self.assertLogs("mindnlp.transformers.modeling_rope_utils", level="WARNING") as logs: - config = _reinitialize_config( - base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} - ) - original_model = LlamaForCausalLM(config) - original_model(**model_inputs) - self.assertEqual(len(logs.output), 1) - self.assertIn("Unrecognized keys", logs.output[0]) - - # from a config with specific rope type but missing one of its mandatory parameters -> ❌ throws exception - with self.assertRaises(KeyError): - config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" - - -@require_mindspore -class LlamaIntegrationTest(unittest.TestCase): - # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) - # Depending on the hardware we get different logits / generations - cuda_compute_capability_major_version = None - - - @slow - def test_llama_3_1_hard(self): - """ - An integration test for llama 3.1. It tests against a long output to ensure the subtle numerical differences - from llama 3.1.'s RoPE can be detected - """ - # diff on `EXPECTED_TEXT`: - # 2024-08-26: updating from torch 2.3.1 to 2.4.0 slightly changes the results. - EXPECTED_TEXT = ( - "Tell me about the french revolution. The french revolution was a period of radical political and social " - "upheaval in France that lasted from 1789 until 1799. 
It was a time of great change and upheaval, marked " - "by the overthrow of the monarchy, the rise of the middle class, and the eventual establishment of the " - "First French Republic.\nThe revolution began in 1789 with the Estates-General, a representative " - "assembly that had not met since 1614. The Third Estate, which represented the common people, " - "demanded greater representation and eventually broke away to form the National Assembly. This marked " - "the beginning of the end of the absolute monarchy and the rise of the middle class.\n" - ) - - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct") - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Meta-Llama-3.1-8B-Instruct", device_map="auto", ms_dtype=mindspore.bfloat16 - ) - input_text = ["Tell me about the french revolution."] - model_inputs = tokenizer(input_text, return_tensors="ms") - - generated_ids = model.generate(**model_inputs, max_new_tokens=128, do_sample=False) - generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(generated_text, EXPECTED_TEXT) - - @slow - - def test_model_7b_logits_bf16(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="auto", ms_dtype=mindspore.bfloat16, attn_implementation="eager" - ) - - with no_grad(): - out = model(mindspore.tensor([input_ids])) - # Expected mean on dim = -1 - - # fmt: off - EXPECTED_MEAN = { - 7: mindspore.tensor([[-6.5061, -4.1147, -4.9669, -3.2038, 0.8069, -2.9694, 1.2864, -3.3786]]), - 8: mindspore.tensor([[-6.5208, -4.1218, -4.9377, -3.2536, 0.8127, -2.9811, 1.2918, -3.3848]]) - } - - self.assertTrue(ops.allclose(EXPECTED_MEAN[self.cuda_compute_capability_major_version], out.logits.mean(-1), atol=1e-2, rtol=1e-2)) - - # slicing logits[0, 0, 0:15] - EXPECTED_SLICE = { - 7: mindspore.tensor([[-12.5000, -7.0625, -0.6289, -7.8750, -6.9688, -7.8125, -6.4688, -7.4375, -7.6875, -6.9375, -6.0312, -7.0000, -1.8594, 1.8438, -8.5000]]), - 8: mindspore.tensor([[-12.5625, -7.1250, -0.6289, -7.8750, -6.9688, -7.8125, -6.5000, -7.4375, -7.6562, -6.9688, -6.0312, -7.0312, -1.8203, 1.8750, -8.5000]]) - } - # fmt: on - - self.assertTrue( - ops.allclose( - EXPECTED_SLICE[self.cuda_compute_capability_major_version], - out.logits[0, 0, :15], - atol=1e-2, - rtol=1e-2, - ) - ) - - @slow - - def test_model_7b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="auto", ms_dtype=mindspore.float16 - ) - - with no_grad(): - out = model(mindspore.tensor([input_ids])) - - # fmt: off - # Expected mean on dim = -1 - EXPECTED_MEAN = { - 7: mindspore.tensor([[-6.6420, -4.1227, -4.9809, -3.2041, 0.8261, -3.0052, 1.2957, -3.3648]]), - 8: mindspore.tensor([[-6.6544, -4.1259, -4.9840, -3.2456, 0.8261, -3.0124, 1.2971, -3.3641]]) - } - - self.assertTrue(ops.allclose(EXPECTED_MEAN[self.cuda_compute_capability_major_version], out.logits.mean(-1), atol=1e-2, rtol=1e-2)) - - # slicing logits[0, 0, 0:15] - EXPECTED_SLICE = { - 7: mindspore.tensor([-12.8125, -7.3359, -0.4846, -8.0234, -7.2383, -7.9922, -6.4805, -7.7344, -7.8125, -7.0078, -6.1797, -7.1094, -1.8633, 1.9736, -8.6016]), - 8: mindspore.tensor([-12.8281, -7.4609, -0.4668, -8.0703, -7.2539, -8.0078, -6.4961, -7.7734, -7.8516, -7.0352, -6.2188, -7.1367, -1.8564, 1.9922, -8.6328]) - } - # fmt: on - - self.assertTrue( - ops.allclose( - 
EXPECTED_SLICE[self.cuda_compute_capability_major_version], - out.logits[0, 0, :15], - atol=1e-2, - rtol=1e-2, - ) - ) - - @slow - def test_model_7b_dola_generation(self): - # ground truth text generated with dola_layers="low", repetition_penalty=1.2 - EXPECTED_TEXT_COMPLETION = ( - "Simply put, the theory of relativity states that 1) time and space are relative, and 2) the laws of " - "physics are the same for all observers in uniform motion relative to one another.\n\nThe theory of " - "relativity was developed by Albert Einstein in the early 20th century, and it revolutionized our " - "understanding of space and time." - ) - prompt = "Simply put, the theory of relativity states that " - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-chat-hf", device_map="sequential", ms_dtype=mindspore.float16 - ) - model_inputs = tokenizer(prompt, return_tensors="ms") - - # greedy generation outputs - generated_ids = model.generate( - **model_inputs, max_new_tokens=64, top_p=None, temperature=1, do_sample=False, dola_layers="low" - ) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - @slow - @require_mindspore - def test_compile_static_cache(self): - # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 - # work as intended. See https://github.com/pytorch/pytorch/issues/121943 - NUM_TOKENS_TO_GENERATE = 40 - # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test - # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. - EXPECTED_TEXT_COMPLETION = [ - "Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " - "reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " - "theory of relativ", - "My favorite all time favorite condiment is ketchup. I love it on everything. 
I love it on my eggs, " - "my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", - ] - - prompts = [ - "Simply put, the theory of relativity states that ", - "My favorite all time favorite condiment is ketchup.", - ] - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", device_map="sequential", ms_dtype=mindspore.float16 - ) - inputs = tokenizer(prompts, return_tensors="ms", padding=True) - - # Dynamic Cache - generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) - dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, dynamic_text) - - # Static Cache - generated_ids = model.generate( - **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" - ) - static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, static_text) - - # Static Cache + compile - model._cache = None # clear cache object, initialized when we pass `cache_implementation="static"` - # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - # generated_ids = model.generate( - # **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" - # ) - # static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - # self.assertEqual(EXPECTED_TEXT_COMPLETION, static_compiled_text) - - -@slow -class Mask4DTestHard(unittest.TestCase): - def tearDown(self): - gc.collect() - - def setUp(self): - model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - self.model_dtype = mindspore.float32 - self.tokenizer = LlamaTokenizer.from_pretrained(model_name) - self.model = LlamaForCausalLM.from_pretrained(model_name, ms_dtype=self.model_dtype) - - def get_test_data(self): - template = "my favorite {}" - items = ("pet is a", "artist plays a", "name is L") # same number of tokens in each item - - batch_separate = [template.format(x) for x in items] # 3 separate lines - batch_shared_prefix = template.format(" ".join(items)) # 1 line with options concatenated - - input_ids = self.tokenizer(batch_separate, return_tensors="ms").input_ids - input_ids_shared_prefix = self.tokenizer(batch_shared_prefix, return_tensors="ms").input_ids - - mask_shared_prefix = mindspore.tensor( - [ - [ - [ - [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0], - [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], - [1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1], - ] - ] - ], - ) - - position_ids = ops.arange(input_ids.shape[1]).tile(input_ids.shape[0], 1) - - # building custom positions ids based on custom mask - position_ids_shared_prefix = (mask_shared_prefix.sum(dim=-1) - 1).reshape(1, -1) - # effectively: position_ids_shared_prefix = mindspore.tensor([[0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5]]).to(device) - - # inverting the mask - min_dtype = float(ops.finfo(self.model_dtype).min) - mask_shared_prefix = (mask_shared_prefix.eq(0.0)).to(dtype=self.model_dtype) * min_dtype - - return input_ids, position_ids, 
input_ids_shared_prefix, mask_shared_prefix, position_ids_shared_prefix - - def test_stacked_causal_mask(self): - ( - input_ids, - position_ids, - input_ids_shared_prefix, - mask_shared_prefix, - position_ids_shared_prefix, - ) = self.get_test_data() - - # regular batch - logits = self.model.forward(input_ids, position_ids=position_ids).logits - logits_last = logits[:, -1, :] # last tokens in each batch line - decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] - - # single forward run with 4D custom mask - logits_shared_prefix = self.model.forward( - input_ids_shared_prefix, attention_mask=mask_shared_prefix, position_ids=position_ids_shared_prefix - ).logits - logits_shared_prefix_last = logits_shared_prefix[ - 0, ops.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1], : - ] # last three tokens - decoded_shared_prefix = [self.tokenizer.decode(t) for t in logits_shared_prefix_last.argmax(dim=-1)] - - self.assertEqual(decoded, decoded_shared_prefix) - - def test_partial_stacked_causal_mask(self): - # Same as the test above, but the input is passed in two groups. It tests that we can pass partial 4D attention masks - - ( - input_ids, - position_ids, - input_ids_shared_prefix, - mask_shared_prefix, - position_ids_shared_prefix, - ) = self.get_test_data() - - # regular batch - logits = self.model.forward(input_ids, position_ids=position_ids).logits - logits_last = logits[:, -1, :] # last tokens in each batch line - decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] - - # 2 forward runs with custom 4D masks - part_a = 3 # split point - - input_1a = input_ids_shared_prefix[:, :part_a] - position_ids_1a = position_ids_shared_prefix[:, :part_a] - mask_1a = mask_shared_prefix[:, :, :part_a, :part_a] - - outs_1a = self.model.forward(input_1a, attention_mask=mask_1a, position_ids=position_ids_1a) - past_key_values_a = outs_1a["past_key_values"] - - # Case 1: we pass a 4D attention mask regarding the current sequence length (i.e. 
[..., seq_len, full_len]) - input_1b = input_ids_shared_prefix[:, part_a:] - position_ids_1b = position_ids_shared_prefix[:, part_a:] - mask_1b = mask_shared_prefix[:, :, part_a:, :] - outs_1b = self.model.forward( - input_1b, - attention_mask=mask_1b, - position_ids=position_ids_1b, - past_key_values=past_key_values_a, - ) - decoded_1b = [ - self.tokenizer.decode(t) - for t in outs_1b.logits.argmax(-1)[ - 0, ops.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1] - part_a - ] - ] - self.assertEqual(decoded, decoded_1b) - - def test_stacked_causal_mask_static_cache(self): - """same as above but with StaticCache""" - ( - input_ids, - position_ids, - input_ids_shared_prefix, - mask_shared_prefix, - position_ids_shared_prefix, - ) = self.get_test_data() - - # regular batch - logits = self.model.forward(input_ids, position_ids=position_ids).logits - logits_last = logits[:, -1, :] # last tokens in each batch line - decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] - - # upgrade the model with StaticCache - max_cache_len = 16 # note that max_cache_len is greater than the attention_mask.shape[-1] - past_key_values = StaticCache( - config=self.model.config, - batch_size=1, - max_cache_len=max_cache_len, - dtype=self.model.dtype, - ) - - padded_attention_mask = nn.functional.pad( - input=mask_shared_prefix, - pad=(0, max_cache_len - mask_shared_prefix.shape[-1]), - mode="constant", - value=float(ops.finfo(self.model_dtype).min), - ) - - # single forward run with 4D custom mask - logits_shared_prefix = self.model.forward( - input_ids_shared_prefix, - attention_mask=padded_attention_mask, - position_ids=position_ids_shared_prefix, - cache_position=ops.arange(input_ids_shared_prefix.shape[-1]), - past_key_values=past_key_values, - ).logits - logits_shared_prefix_last = logits_shared_prefix[ - 0, ops.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1], : - ] # last three tokens - decoded_shared_prefix = [self.tokenizer.decode(t) for t in logits_shared_prefix_last.argmax(dim=-1)] - - self.assertEqual(decoded, decoded_shared_prefix) - - def test_partial_stacked_causal_mask_static_cache(self): - # Same as the test above, but the input is passed in two groups. 
It tests that we can pass partial 4D attention masks - # we pass a 4D attention mask shaped [..., seq_len, full_static_cache_len]) - ( - input_ids, - position_ids, - input_ids_shared_prefix, - mask_shared_prefix, - position_ids_shared_prefix, - ) = self.get_test_data() - - # regular batch - logits = self.model.forward(input_ids, position_ids=position_ids).logits - logits_last = logits[:, -1, :] # last tokens in each batch line - decoded = [self.tokenizer.decode(t) for t in logits_last.argmax(dim=-1)] - - # upgrade the model with StaticCache - max_cache_len = 16 # note that max_cache_len is greater than the attention_mask.shape[-1] - past_key_values = StaticCache( - config=self.model.config, - batch_size=1, - max_cache_len=max_cache_len, - dtype=self.model.dtype, - ) - - # forward run for the first part of input - part_a = 3 # split point - - input_1a = input_ids_shared_prefix[:, :part_a] - position_ids_1a = position_ids_shared_prefix[:, :part_a] - mask_1a = mask_shared_prefix[:, :, :part_a, :part_a] - - padded_mask_1a = nn.functional.pad( - input=mask_1a, - pad=(0, max_cache_len - mask_1a.shape[-1]), - mode="constant", - value=float(ops.finfo(self.model_dtype).min), - ) - - _ = self.model.forward( - input_1a, - attention_mask=padded_mask_1a, - position_ids=position_ids_1a, - cache_position=ops.arange(part_a), - past_key_values=past_key_values, - ) - - # forward run for the second part of input - input_1b = input_ids_shared_prefix[:, part_a:] - position_ids_1b = position_ids_shared_prefix[:, part_a:] - mask_1b = mask_shared_prefix[:, :, part_a:, :] - - padded_mask_1b = nn.functional.pad( - input=mask_1b, pad=(0, max_cache_len - mask_1b.shape[-1]), mode="constant", value=0 - ) - - outs_1b = self.model.forward( - input_1b, - attention_mask=padded_mask_1b, - position_ids=position_ids_1b, - cache_position=ops.arange( - part_a, - input_ids_shared_prefix.shape[-1], - ), - past_key_values=past_key_values, - ) - decoded_1b = [ - self.tokenizer.decode(t) - for t in outs_1b.logits.argmax(-1)[ - 0, ops.where(position_ids_shared_prefix == position_ids_shared_prefix.max())[1] - part_a - ] - ] - self.assertEqual(decoded, decoded_1b) \ No newline at end of file diff --git a/tests/transformers/models/llava/__init__.py b/tests/transformers/models/llava/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/llava/test_modeling_llava.py b/tests/transformers/models/llava/test_modeling_llava.py deleted file mode 100644 index 5e5b19654..000000000 --- a/tests/transformers/models/llava/test_modeling_llava.py +++ /dev/null @@ -1,677 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ -"""Testing suite for the MindSpore Llava model.""" - -import copy -import gc -import unittest - -import requests - -from mindnlp.transformers import ( - AutoProcessor, - AutoTokenizer, - LlavaConfig, - LlavaForConditionalGeneration, -) -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - -if is_vision_available(): - from PIL import Image - -class LlavaVisionText2TextModelTester: - def __init__( - self, - parent, - ignore_index=-100, - image_token_index=0, - projector_hidden_act="gelu", - seq_length=7, - vision_feature_select_strategy="default", - vision_feature_layer=-1, - text_config={ - "model_type": "llama", - "seq_length": 7, - "is_training": True, - "use_input_mask": True, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 0, - }, - is_training=True, - vision_config={ - "image_size": 30, - "patch_size": 2, - "num_channels": 3, - "is_training": True, - "hidden_size": 32, - "projection_dim": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "dropout": 0.1, - "attention_dropout": 0.1, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - self.text_config = text_config - self.vision_config = vision_config - self.seq_length = seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.num_channels = 3 - self.image_size = 336 - self.encoder_seq_length = 231 - - def get_config(self): - return LlavaConfig( - text_config=self.text_config, - vision_config=self.vision_config, - ignore_index=self.ignore_index, - image_token_index=self.image_token_index, - projector_hidden_act=self.projector_hidden_act, - vision_feature_select_strategy=self.vision_feature_select_strategy, - vision_feature_layer=self.vision_feature_layer, - ) - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [ - self.batch_size, - self.vision_config["num_channels"], - self.vision_config["image_size"], - self.vision_config["image_size"], - ] - ) - config = self.get_config() - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - input_ids = ids_tensor( - [self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1) - # we are giving 3 
images let's make sure we pass in 3 image tokens - input_ids[:, 1] = config.image_token_index - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase): - """ - Model tester for `LlavaForConditionalGeneration`. - """ - - all_model_classes = (LlavaForConditionalGeneration, - ) if is_mindspore_available() else () - pipeline_model_mapping = { - "image-to-text": LlavaForConditionalGeneration} if is_mindspore_available() else {} - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = LlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester( - self, config_class=LlavaConfig, has_text_modality=False) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(input_ids) - - with no_grad(): - model(**inputs) - - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - # while some other models require pixel_values to be present - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - inputs_embeds = model.get_input_embeddings()(input_ids) - - with no_grad(): - out_ids = model(input_ids=input_ids, **inputs)[0] - out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_tokens_embeddings with config.vocab_size->config.text_config.vocab_size - def test_resize_tokens_embeddings(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.set_train(False) - - model_vocab_size = config.text_config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - 
- # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual( - model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual( - model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - - # make sure that decoder_input_ids are resized as well - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp( - max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - config = copy.deepcopy(original_config) - model = model_class(config) - - model_vocab_size = config.text_config.vocab_size - model.resize_token_embeddings( - model_vocab_size + 10, pad_to_multiple_of=1) - self.assertTrue( - model.config.text_config.vocab_size + 10, model_vocab_size) - - model_embed = model.resize_token_embeddings( - model_vocab_size, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0] // 64, 0) - - self.assertTrue( - model_embed.weight.shape[0], model.config.text_config.vocab_size) - self.assertTrue( - model.config.text_config.vocab_size, model.vocab_size) - - model_embed = model.resize_token_embeddings( - model_vocab_size + 13, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0] // 64, 0) - - # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size - target_dimension = 128 - model_embed = model.resize_token_embeddings( - target_dimension, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0], target_dimension) - - with self.assertRaisesRegex( - ValueError, - "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", - ): - model.resize_token_embeddings( - model_vocab_size, pad_to_multiple_of=1.3) - - # Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_embeddings_untied with config.vocab_size->config.text_config.vocab_size - def test_resize_embeddings_untied(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual( - output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual( - output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual( - output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual( - output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp( - max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Copied from tests.test_modeling_common.ModelTesterMixin.test_tie_model_weights with config.vocab_size->config.text_config.vocab_size - def test_tie_model_weights(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.data.ne(p2.data).sum() > 0: - equal = False - return equal - - for model_class in self.all_model_classes: - config.torchscript = True - model_not_tied = model_class(config) - if model_not_tied.get_output_embeddings() is None: - continue - - config_tied = copy.deepcopy(config) - config_tied.torchscript = False - model_tied = model_class(config_tied) - params_tied = list(model_tied.get_parameters()) - # Check that the embedding layer and decoding layer are the same in size and in value - # 
self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. - model_tied.resize_token_embeddings( - config.text_config.vocab_size + 10) - params_tied_2 = list(model_tied.get_parameters()) - self.assertEqual(len(params_tied_2), len(params_tied)) - - -@require_mindspore -class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): - def setUp(self): - self.processor = AutoProcessor.from_pretrained( - "llava-hf/bakLlava-v1-hf") - - def tearDown(self): - gc.collect() - - @slow - def test_small_model_integration_test(self): - # Let' s make sure we test the preprocessing to replace what is used - model = LlavaForConditionalGeneration.from_pretrained( - "llava-hf/bakLlava-v1-hf", load_in_4bit=True) - - prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" - image_file = "https://llava-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = self.processor(prompt, raw_image, return_tensors="ms") - - EXPECTED_INPUT_IDS = mindspore.Tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722, 315, 1023, - 347, 13831, 925, 684, 739, 315, 3251, 456, 1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip - self.assertTrue(ops.equal(inputs["input_ids"], EXPECTED_INPUT_IDS)) - - output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip - - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - def test_small_model_integration_test_llama(self): - # Let' s make sure we test the preprocessing to replace what is used - model_id = "llava-hf/llava-1.5-7b-hf" - - model = LlavaForConditionalGeneration.from_pretrained( - "llava-hf/llava-1.5-7b-hf", load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" - image_file = "https://llava-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(prompt, raw_image, - return_tensors="ms").astype(mindspore.float16) - - output = model.generate(**inputs, max_new_tokens=900, do_sample=False) - EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." 
# fmt: skip - - self.assertEqual( - processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - def test_small_model_integration_test_llama_batched(self): - # Let' s make sure we test the preprocessing to replace what is used - model_id = "llava-hf/llava-1.5-7b-hf" - - model = LlavaForConditionalGeneration.from_pretrained( - "llava-hf/llava-1.5-7b-hf", load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", - "USER: \nWhat is this? ASSISTANT:", - ] - image1 = Image.open(requests.get( - "https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get( - "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = processor( - prompts, images=[image1, image2], return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', - 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip - - self.assertEqual(processor.batch_decode( - output, skip_special_tokens=True), EXPECTED_DECODED_TEXT) - - @slow - def test_small_model_integration_test_batch(self): - # Let' s make sure we test the preprocessing to replace what is used - model = LlavaForConditionalGeneration.from_pretrained( - "llava-hf/bakLlava-v1-hf", load_in_4bit=True) - # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT:", - ] - image1 = Image.open(requests.get( - "https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get( - "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = self.processor( - prompts, images=[image1, image2], return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring along', - 'USER: \nWhat is this?\nASSISTANT: Cats'] # fmt: skip - self.assertEqual(self.processor.batch_decode( - output, skip_special_tokens=True), EXPECTED_DECODED_TEXT) - - @slow - def test_small_model_integration_test_llama_batched_regression(self): - # Let' s make sure we test the preprocessing to replace what is used - model_id = "llava-hf/llava-1.5-7b-hf" - - # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) - model = LlavaForConditionalGeneration.from_pretrained( - "llava-hf/llava-1.5-7b-hf", load_in_4bit=True, attn_implementation="eager" - ) - processor = AutoProcessor.from_pretrained(model_id, pad_token="") - - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", - ] - image1 = Image.open(requests.get( - "https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get( - "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = processor( - prompts, images=[image1, image2, image1], return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', - 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip - - self.assertEqual(processor.batch_decode( - output, skip_special_tokens=True), EXPECTED_DECODED_TEXT) - - @slow - @require_mindspore - @require_vision - def test_batched_generation(self): - model = LlavaForConditionalGeneration.from_pretrained( - "llava-hf/llava-1.5-7b-hf") - - processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - - prompt1 = "\n\nUSER: What's the the difference of two images?\nASSISTANT:" - prompt2 = "\nUSER: Describe the image.\nASSISTANT:" - prompt3 = "\nUSER: Describe the image.\nASSISTANT:" - url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" - url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" - image1 = Image.open(requests.get(url1, stream=True).raw) - image2 = Image.open(requests.get(url2, stream=True).raw) - - inputs = processor( - text=[prompt1, prompt2, prompt3], - images=[image1, image2, image1, image2], - return_tensors="ms", - padding=True, - ) - - model = model.set_train(False) - - EXPECTED_OUTPUT = [ - "\n \nUSER: What's the the difference of two images?\nASSISTANT: In the two images, the primary difference is the presence of a small dog holding a flower in one", - "\nUSER: Describe the image.\nASSISTANT: The image features a small, fluffy dog sitting on a sidewalk. The dog is holding", - "\nUSER: Describe the image.\nASSISTANT: The image features a lone, adult llama standing on a grassy hill. 
The llama", - ] - - generate_ids = model.generate(**inputs, max_new_tokens=20) - outputs = processor.batch_decode( - generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertEqual(outputs, EXPECTED_OUTPUT) - - @slow - def test_llava_index_error_bug(self): - # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore - # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for - # more details - model_id = "llava-hf/llava-1.5-7b-hf" - model = LlavaForConditionalGeneration.from_pretrained( - model_id, load_in_4bit=True) - - processor = AutoProcessor.from_pretrained(model_id) - - # Simulate a super long prompt - user_prompt = "Describe the image:?\n" * 200 - prompt = f"USER: \n{user_prompt}ASSISTANT:" - image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" - - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(prompt, raw_image, - return_tensors="ms").astype(mindspore.float16) - - # Make sure that `generate` works - _ = model.generate(**inputs, max_new_tokens=20) - - @slow - def test_llava_merge_inputs_error_bug(self): - # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore - model_id = "llava-hf/llava-1.5-7b-hf" - model = LlavaForConditionalGeneration.from_pretrained( - model_id - ) - - # Simulate some user inputs - pixel_values = ops.randn( - (2, 3, 336, 336), - dtype=mindspore.float32, - ) - input_ids = mindspore.Tensor( - [ - [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900], - [1, 15043, 7084, 29901, 29871, 32000, 29871, 13, 7900], - ], - dtype=mindspore.int64, - ) - attention_mask = mindspore.Tensor( - [[0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], - dtype=mindspore.int64, - ) - - # Make sure that the loss is properly computed - loss = model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - labels=input_ids, - ).loss - loss.backward() - - def test_tokenizer_integration(self): - slow_tokenizer = AutoTokenizer.from_pretrained( - "liuhaotian/llava-v1.6-34b", use_fast=False) - slow_tokenizer.add_tokens("", True) - - fast_tokenizer = AutoTokenizer.from_pretrained( - "liuhaotian/llava-v1.6-34b", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - from_slow=True, - legacy=False, - ) - fast_tokenizer.add_tokens("", True) - - prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - # If the token is added as special, it's not normalized, and the only diff is the extra space after special tokens. - # https://github.com/huggingface/transformers/pull/28881 is the fix for this. 
- self.assertEqual( - slow_tokenizer.tokenize(prompt), - ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', - '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] - ) # fmt: skip - - self.assertEqual( - fast_tokenizer.tokenize(prompt), - ['<|im_start|>', '▁system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', '▁user', '\n', '', - '▁', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', '▁assistant', '\n'] - ) # fmt: skip diff --git a/tests/transformers/models/llava_next/__init__.py b/tests/transformers/models/llava_next/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/llava_next/test_image_processor_llava_next.py b/tests/transformers/models/llava_next/test_image_processor_llava_next.py deleted file mode 100644 index 5ef5a63a7..000000000 --- a/tests/transformers/models/llava_next/test_image_processor_llava_next.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================ - -import unittest - -import numpy as np - -from mindnlp.configs import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD -from mindnlp.transformers.models.llava_next.image_processing_llava_next import select_best_resolution -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore as ms - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import LlavaNextImageProcessor - - -class LlavaNextImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_normalize=True, - image_mean=OPENAI_CLIP_MEAN, - image_std=OPENAI_CLIP_STD, - do_convert_rgb=True, - ): - size = size if size is not None else {"shortest_edge": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_convert_rgb = do_convert_rgb - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_normalize": 
self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_convert_rgb": self.do_convert_rgb, - } - - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = LlavaNextImageProcessor if is_vision_available() else None - - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.setUp with CLIP->LlavaNext - def setUp(self): - self.image_processor_tester = LlavaNextImageProcessingTester(self) - - @property - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - self.assertTrue(hasattr(image_processing, "image_grid_pinpoints")) - - # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - def test_select_best_resolution(self): - possible_resolutions = [[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]] - - # Test with a square aspect ratio - best_resolution = select_best_resolution((336, 336), possible_resolutions) - self.assertEqual(best_resolution, (672, 336)) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape 
= (1, 1445, 3, 18, 18) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = (1, 1445, 3, 18, 18) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) - - for image in image_inputs: - self.assertIsInstance(image, ms.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = (1, 1445, 3, 18, 18) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - expected_output_image_shape = (7, 1445, 3, 18, 18) - self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - - @unittest.skip("LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy - def test_call_numpy_4_channels(self): - pass diff --git a/tests/transformers/models/llava_next/test_modeling_llava_next.py b/tests/transformers/models/llava_next/test_modeling_llava_next.py deleted file mode 100644 index b42451751..000000000 --- a/tests/transformers/models/llava_next/test_modeling_llava_next.py +++ /dev/null @@ -1,588 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch Llava-NeXT model.""" - -import gc -import unittest - -import requests -from huggingface_hub import hf_hub_download - -from mindnlp.transformers import ( - AutoProcessor, - LlavaNextConfig, - LlavaNextForConditionalGeneration -) -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches -else: - is_torch_greater_or_equal_than_2_0 = False - -if is_vision_available(): - from PIL import Image - - -class LlavaNextVisionText2TextModelTester: - def __init__( - self, - parent, - ignore_index=-100, - image_token_index=0, - projector_hidden_act="gelu", - seq_length=7, - vision_feature_select_strategy="default", - vision_feature_layer=-1, - text_config={ - "model_type": "llama", - "seq_length": 7, - "is_training": True, - "use_input_mask": True, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 580, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 0, - }, - is_training=True, - vision_config={ - "image_size": 16, - "patch_size": 2, - "num_channels": 3, - "is_training": True, - "hidden_size": 32, - "projection_dim": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "dropout": 0.1, - "attention_dropout": 0.1, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - self.text_config = text_config - self.vision_config = vision_config - self.seq_length = seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.num_channels = 3 - self.image_size = 30 - self.encoder_seq_length = 342 - self.image_grid_pinpoints = [[32, 32]] - - def get_config(self): - return LlavaNextConfig( - text_config=self.text_config, - vision_config=self.vision_config, - ignore_index=self.ignore_index, - image_token_index=self.image_token_index, - projector_hidden_act=self.projector_hidden_act, - vision_feature_select_strategy=self.vision_feature_select_strategy, - vision_feature_layer=self.vision_feature_layer, - image_grid_pinpoints=self.image_grid_pinpoints, - ) - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [ - self.batch_size, - 5, - self.vision_config["num_channels"], - self.vision_config["image_size"], - self.vision_config["image_size"], - ] - ) - config = self.get_config() - - return config, 
pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - # we are giving 3 images let's make sure we pass in 3 image tokens - input_ids[:, 1] = config.image_token_index - labels = ops.zeros((self.batch_size, self.seq_length), dtype=mindspore.int64) - # maskout where the image token is - labels[:, 1] == self.ignore_index - inputs_dict = { - "pixel_values": pixel_values, - "image_sizes": mindspore.tensor( - [[self.vision_config["image_size"], self.vision_config["image_size"]]] * self.batch_size - ), - "input_ids": input_ids, - "attention_mask": attention_mask, - "labels": labels, - } - return config, inputs_dict - - def create_and_check_llava_next_model_fp16_forward( - self, config, input_ids, pixel_values, attention_mask, image_sizes - ): - model = LlavaNextForConditionalGeneration(config=config) - model.half() - model.eval() - logits = model( - input_ids=input_ids, - attention_mask=attention_mask, - image_sizes=image_sizes, - pixel_values=pixel_values.to(mindspore.bfloat16), - return_dict=True, - )["logits"] - self.parent.assertFalse(ops.isnan(logits).any().item()) - - # def create_and_check_llava_next_model_fp16_autocast_forward( - # self, config, input_ids, pixel_values, attention_mask, image_sizes - # ): - # config.torch_dtype = mindspore.float16 - # model = LlavaNextForConditionalGeneration(config=config) - # model.eval() - # with ops.autocast(device_type="cuda", dtype=mindspore.float16): - # logits = model( - # input_ids=input_ids, - # attention_mask=attention_mask, - # image_sizes=image_sizes, - # pixel_values=pixel_values.to(ops.bfloat16), - # return_dict=True, - # )["logits"] - # self.parent.assertFalse(ops.isnan(logits).any().item()) - - -@require_mindspore -class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - """ - Model tester for `LlavaNextForConditionalGeneration`. 
- """ - - all_model_classes = (LlavaNextForConditionalGeneration,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = LlavaNextVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "image_newline" in name: - continue - elif param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(input_ids) - - with no_grad(): - model(**inputs) - - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - # while some other models require pixel_values to be present - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - inputs_embeds = model.get_input_embeddings()(input_ids) - - with no_grad(): - out_ids = model(input_ids=input_ids, **inputs)[0] - out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Feedforward chunking is not yet supported") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="CPU offload is not yet supported") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Compile not yet supported because in LLava models") - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported because in LLava models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - -@require_mindspore -class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase): - def setUp(self): - self.processor = 
AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") - url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" - self.image = Image.open(requests.get(url, stream=True).raw) - - self.prompt = "[INST] \nWhat is shown in this image? [/INST]" - - def tearDown(self): - gc.collect() - - # @slow - # @require_bitsandbytes - # def test_small_model_integration_test(self): - # model = LlavaNextForConditionalGeneration.from_pretrained( - # "llava-hf/llava-v1.6-mistral-7b-hf", - # load_in_4bit=True, - # ) - - # inputs = self.processor(self.prompt, self.image, return_tensors="ms") - - # # verify inputs against original implementation - # filepath = hf_hub_download( - # repo_id="nielsr/test-image", - # filename="llava_1_6_input_ids.pt", - # repo_type="dataset", - # ) - # original_input_ids = load(filepath, map_location="cpu") - # # replace -200 by image_token_index (since we use token ID = 32000 for the image token) - # original_input_ids[original_input_ids == -200] = model.config.image_token_index - # assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() - - # filepath = hf_hub_download( - # repo_id="nielsr/test-image", - # filename="llava_1_6_pixel_values.pt", - # repo_type="dataset", - # ) - # original_pixel_values = load(filepath, map_location="cpu") - # assert ops.allclose(original_pixel_values, inputs.pixel_values.half()) - - # # verify single forward pass - # with no_grad(): - # output = model(**inputs) - - # expected_slice = mindspore.tensor( - # [ - # [-4.7695, -4.5664, -0.2786], - # [-10.6250, -10.8906, -2.5254], - # [-6.7383, -7.2461, -0.6787], - # ], - # dtype=mindspore.float32, - # ) - # assert ops.allclose(output.logits[0, :3, :3], expected_slice, atol=1e-3) - - # # verify generation - # output = model.generate(**inputs, max_new_tokens=100) - # EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes starting from the same point. This particular radar chart is showing the performance of various models or systems across different metrics or datasets.\n\nThe chart is divided into several sections, each representing a different model or dataset. The axes represent different metrics or datasets, such as "MMM-Vet," "MMM-Bench," "L' # fmt: skip - - # self.assertEqual( - # self.processor.decode(output[0], skip_special_tokens=True), - # EXPECTED_DECODED_TEXT, - # ) - - # @slow - # @require_bitsandbytes - # def test_small_model_integration_test_batch(self): - # model = LlavaNextForConditionalGeneration.from_pretrained( - # "llava-hf/llava-v1.6-mistral-7b-hf", load_in_4bit=True - # ) - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # cats_image = Image.open(requests.get(url, stream=True).raw) - - # inputs = self.processor( - # [self.prompt, self.prompt], - # images=[self.image, cats_image], - # return_tensors="ms", - # padding=True, - # ) - - # # it should not matter whether two images are the same size or not - # output = model.generate(**inputs, max_new_tokens=20) - - # EXPECTED_DECODED_TEXT = ['[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays', '[INST] \nWhat is shown in this image? 
[/INST] The image shows two cats lying on a pink surface, which appears to be a couch or a cush'] # fmt: skip - # self.assertEqual( - # self.processor.batch_decode(output, skip_special_tokens=True), - # EXPECTED_DECODED_TEXT, - # ) - - # @slow - # @require_bitsandbytes - # def test_small_model_integration_test_unk_token(self): - # # related to (#29835) - # model = LlavaNextForConditionalGeneration.from_pretrained( - # "llava-hf/llava-v1.6-mistral-7b-hf", - # load_in_4bit=True, - # ) - - # prompt_with_unk = "[INST] \nWhat is shown in this image? [/INST]" - # inputs = self.processor(prompt_with_unk, self.image, return_tensors="ms") - - # # verify single forward pass - # with no_grad(): - # output = model(**inputs) - - # # verify generation - # output = model.generate(**inputs, max_new_tokens=40) - # EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes starting from the same point. This particular radar chart' # fmt: skip - - # self.assertEqual( - # self.processor.decode(output[0], skip_special_tokens=True), - # EXPECTED_DECODED_TEXT, - # ) - - # @slow - # @require_bitsandbytes - # def test_small_model_integration_test_batch_different_resolutions(self): - # model = LlavaNextForConditionalGeneration.from_pretrained( - # "llava-hf/llava-v1.6-mistral-7b-hf", - # load_in_4bit=True, - # ) - - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" - # cats_image = Image.open(requests.get(url, stream=True).raw) - # lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) - - # inputs = self.processor( - # [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="ms", padding=True - # ) - # pixel_values = inputs["pixel_values"] - - # # verify pixel values are padded correctly with 0 when one image has more num_patches than the other - # image_num_patches = [ - # image_size_to_num_patches( - # image_size=imsize, - # grid_pinpoints=model.config.image_grid_pinpoints, - # patch_size=model.config.vision_config.image_size, - # ) - # for imsize in inputs["image_sizes"] - # ] - # for pix_val, num_patch in zip(pixel_values, image_num_patches): - # self.assertTrue(ops.all(pix_val[num_patch:] == 0)) # pad on the right - # for i in range(num_patch): - # self.assertFalse(ops.all(pix_val[i : i + 1] == 0)) # no padding expected in any of patches - - # # check loss when labels are passed - # inputs["labels"] = inputs["input_ids"].clone() - # with no_grad(): - # output = model(**inputs) - - # expected_slice = mindspore.tensor( - # [[-0.0308, -0.0313, -0.0314], [-0.3064, -0.3013, -0.2986], [-0.1226, -0.1246, -0.1210]], - # dtype=mindspore.float32, - # ) - # assert ops.allclose(output.logits[0, -3:, -3:], expected_slice, atol=1e-3) - # assert ops.allclose(output.loss, mindspore.tensor(6.8619)) - - # # verify generation - # output = model.generate(**inputs, max_new_tokens=50) - # EXPECTED_DECODED_TEXT = '[INST] \nWhat is shown in this image? [/INST] The image shows a forested area with a misty or foggy atmosphere. In the foreground, there is a grassy field with a few deer grazing. 
The deer are partially obscured by the fog, and the trees in the background' # fmt: skip - # self.assertEqual( - # self.processor.decode(output[0], skip_special_tokens=True), - # EXPECTED_DECODED_TEXT, - # ) - - # @slow - # @require_bitsandbytes - # def test_small_model_integration_test_batch_matches_single(self): - # model = LlavaNextForConditionalGeneration.from_pretrained( - # "llava-hf/llava-v1.6-mistral-7b-hf", - # load_in_4bit=True, - # ) - - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" - # cats_image = Image.open(requests.get(url, stream=True).raw) - # lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) - - # inputs_batched = self.processor( - # [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="ms", padding=True - # ) - - # inputs_single = self.processor(self.prompt, images=lowres_img, return_tensors="ms", padding=True) - - # # verify generation - # output_batched = model.generate(**inputs_batched, max_new_tokens=50) - # output_single = model.generate(**inputs_single, max_new_tokens=50) - # self.assertEqual( - # self.processor.decode(output_batched[0], skip_special_tokens=True), - # self.processor.decode(output_single[0], skip_special_tokens=True), - # ) - - # @slow - # @require_bitsandbytes - # def test_padding_side_when_merging_inputs(self): - # model = LlavaNextForConditionalGeneration.from_pretrained( - # "llava-hf/llava-v1.6-mistral-7b-hf", - # load_in_4bit=True, - # ) - - # url = "http://images.cocodataset.org/val2017/000000039769.jpg" - # lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" - # cats_image = Image.open(requests.get(url, stream=True).raw) - # lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) - - # inputs_batched = self.processor( - # [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="ms", padding=True - # ) - - # # model is in eval mode by default so we should get pad on the left side - # # we can check the first hidden-states (aka inputs embeds) - # # the first element was lo-res image and we expect the first 1414 tokens to be all pads - # output_eval = model(**inputs_batched, output_hidden_states=True) - # self.assertTrue((output_eval.hidden_states[0][0, :1414, ...] == 0).all().item()) - - # # otherwise padding is on the right side, so it's last 1414 tokens - # self.processor.padding_side = "right" - # inputs_batched = self.processor( - # [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="ms", padding=True - # ) - - # model.train() - # with no_grad(): - # output_train = model(**inputs_batched, output_hidden_states=True) - # self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] == 0).all().item()) - - # with self.assertLogs("transformers", level="WARNING") as logs: - # model.padding_side = "left" - # model.train() - # model(**inputs_batched, output_hidden_states=True) - - # self.assertIn( - # "Padding side is set to 'left' but the model is in training mode. For training", logs.output[0] - # ) - - # with self.assertLogs("transformers", level="WARNING") as logs: - # model.padding_side = "right" - # model.eval() - # model(**inputs_batched, output_hidden_states=True) - - # self.assertIn( - # "Padding side is set to 'right' but the model is in inference mode. 
For correct", logs.output[0] - # ) - - # @slow - # @require_bitsandbytes - # def test_expansion_in_processing(self): - # model_id = "llava-hf/llava-v1.6-mistral-7b-hf" - # model = LlavaNextForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) - # processor = AutoProcessor.from_pretrained(model_id) - - # prompt = "USER: \nDescribe the image:\nASSISTANT:" - # image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" - # raw_image = Image.open(requests.get(image_file, stream=True).raw) - - # # check processing with expansion of inputs - # processor.vision_feature_select_strategy = "default" - # processor.patch_size = 14 - # inputs_expanded = processor(prompt, raw_image, return_tensors="ms").to(mindspore.float16) - # self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2356) - - # # check processing without expansion of inputs (legacy behavior) - # processor.vision_feature_select_strategy = None - # processor.patch_size = None - # inputs = processor(prompt, raw_image, return_tensors="ms").to(mindspore.float16) - # self.assertTrue(inputs.input_ids.shape[-1] == 17) - - # # generate exactly 20 tokens - # output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20) - # output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20) - - # # check that both inputs are handled correctly and generate the same output - # self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist()) \ No newline at end of file diff --git a/tests/transformers/models/longformer/__init__.py b/tests/transformers/models/longformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/longformer/test_modeling_longformer.py b/tests/transformers/models/longformer/test_modeling_longformer.py deleted file mode 100644 index 6b7697d14..000000000 --- a/tests/transformers/models/longformer/test_modeling_longformer.py +++ /dev/null @@ -1,755 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
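The test_modeling_longformer.py file removed below exercised Longformer output shapes with a tiny, randomly initialized model (LongformerModelTester plus create_and_check_model). A minimal standalone sketch of that same shape check follows; it only relies on the LongformerConfig / LongformerModel / mindnlp.core.ops API visible in the deleted file, and the small hyperparameters are illustrative rather than authoritative.

import mindspore
from mindnlp.core import ops
from mindnlp.transformers import LongformerConfig, LongformerModel

# Tiny configuration mirroring the deleted LongformerModelTester defaults.
config = LongformerConfig(
    vocab_size=99,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=37,
    max_position_embeddings=512,
    attention_window=4,
)

model = LongformerModel(config)
model.set_train(False)  # inference mode, as in the deleted tests

batch_size, seq_length = 2, 8
input_ids = ops.randint(0, config.vocab_size, (batch_size, seq_length))
attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64)

outputs = model(input_ids, attention_mask=attention_mask)
# The deleted create_and_check_model asserted exactly these two shapes.
assert outputs.last_hidden_state.shape == (batch_size, seq_length, config.hidden_size)
assert outputs.pooler_output.shape == (batch_size, config.hidden_size)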
- - -import unittest -import numpy as np - -from mindnlp.transformers import LongformerConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - LongformerForMaskedLM, - LongformerForMultipleChoice, - LongformerForQuestionAnswering, - LongformerForSequenceClassification, - LongformerForTokenClassification, - LongformerModel, - LongformerSelfAttention, - ) - - -class LongformerModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - attention_window=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.attention_window = attention_window - - # `ModelTesterMixin.test_attention_outputs` is expecting attention tensors to be of size - # [num_attention_heads, encoder_seq_length, encoder_key_length], but LongformerSelfAttention - # returns attention of shape [num_attention_heads, encoder_seq_length, self.attention_window + 1] - # because its local attention only attends to `self.attention_window + 1` locations - # (assuming no token with global attention, otherwise the last dimension of attentions - # is x + self.attention_window + 1, where x is the number of tokens with global attention) - self.key_length = self.attention_window + 2 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, 
token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return LongformerConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - attention_window=self.attention_window, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def create_and_check_attention_mask_determinism( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LongformerModel(config=config) - - model.set_train(False) - - attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - output_with_mask = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] - output_without_mask = model(input_ids)["last_hidden_state"] - self.parent.assertTrue(np.allclose(output_with_mask[0, 0, :5].asnumpy(), output_without_mask[0, 0, :5].asnumpy(), atol=1e-4)) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LongformerModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_with_global_attention_mask( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LongformerModel(config=config) - - model.set_train(False) - global_attention_mask = input_mask.copy() - global_attention_mask[:, input_mask.shape[-1] // 2] = 0 - global_attention_mask = global_attention_mask - - result = model( - input_ids, - attention_mask=input_mask, - global_attention_mask=global_attention_mask, - token_type_ids=token_type_ids, - ) - result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask) - result = model(input_ids, global_attention_mask=global_attention_mask) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LongformerForMaskedLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = LongformerForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - 
global_attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = LongformerForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = LongformerForTokenClassification(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = LongformerForMultipleChoice(config=config) - - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - global_attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - global_attention_mask = ops.zeros_like(input_ids) - global_attention_mask[:, -1] = 1 - - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - "global_attention_mask": global_attention_mask, - } - return config, inputs_dict - - def prepare_config_and_inputs_for_question_answering(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - # Replace sep_token_id by some random id - input_ids[input_ids == config.sep_token_id] = ops.randint(0, config.vocab_size, (1,)).item() - # Make sure there are exactly three sep_token_id - input_ids[:, -3:] = config.sep_token_id - input_mask = ops.ones_like(input_ids) - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - -@require_mindspore -class LongformerModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False # pruning is not supported - test_torchscript 
= False - - all_model_classes = ( - ( - LongformerModel, - LongformerForMaskedLM, - LongformerForSequenceClassification, - LongformerForQuestionAnswering, - LongformerForTokenClassification, - LongformerForMultipleChoice, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": LongformerModel, - "fill-mask": LongformerForMaskedLM, - "question-answering": LongformerForQuestionAnswering, - "text-classification": LongformerForSequenceClassification, - "token-classification": LongformerForTokenClassification, - "zero-shot": LongformerForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # Need to use `0.6` instead of `0.5` for `test_disk_offload` - model_split_percents = [0.6, 0.7, 0.9] - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizer are used. - # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - def setUp(self): - self.model_tester = LongformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=LongformerConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_attention_mask_determinism(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_attention_mask_determinism(*config_and_inputs) - - def test_model_global_attention_mask(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_global_attention_mask(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_question_answering() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # longformer cannot keep gradients in attentions or hidden states - return - - -@require_mindspore -class LongformerModelIntegrationTest(unittest.TestCase): - def _get_hidden_states(self): - return mindspore.tensor( - [ - [ - [ - 4.98332758e-01, - 2.69175139e00, - -7.08081422e-03, - 1.04915401e00, - -1.83476661e00, - 7.67220476e-01, - 2.98580543e-01, - 2.84803992e-02, - 
], - [ - -7.58357372e-01, - 4.20635998e-01, - -4.04739919e-02, - 1.59924145e-01, - 2.05135748e00, - -1.15997978e00, - 5.37166397e-01, - 2.62873606e-01, - ], - [ - -1.69438001e00, - 4.17574660e-01, - -1.49196962e00, - -1.76483717e00, - -1.94566312e-01, - -1.71183858e00, - 7.72903565e-01, - -1.11557056e00, - ], - [ - 5.44028163e-01, - 2.05466114e-01, - -3.63045868e-01, - 2.41865062e-01, - 3.20348382e-01, - -9.05611176e-01, - -1.92690727e-01, - -1.19917547e00, - ], - ] - ], - dtype=mindspore.float32, - ) - - def test_diagonalize(self): - hidden_states = self._get_hidden_states() - hidden_states = hidden_states.reshape((1, 8, 4)) # set seq length = 8, hidden dim = 4 - chunked_hidden_states = LongformerSelfAttention._chunk(hidden_states, window_overlap=2) - window_overlap_size = chunked_hidden_states.shape[2] - self.assertTrue(window_overlap_size == 4) - - padded_hidden_states = LongformerSelfAttention._pad_and_diagonalize(chunked_hidden_states) - - self.assertTrue(padded_hidden_states.shape[-1] == chunked_hidden_states.shape[-1] + window_overlap_size - 1) - - # first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000] - self.assertTrue(np.allclose(padded_hidden_states[0, 0, 0, :4].asnumpy(), chunked_hidden_states[0, 0, 0].asnumpy(), atol=1e-3)) - self.assertTrue( - np.allclose( - padded_hidden_states[0, 0, 0, 4:].asnumpy(), - ops.zeros((3,), dtype=mindspore.float32).asnumpy(), - atol=1e-3, - ) - ) - # last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629] - self.assertTrue(np.allclose(padded_hidden_states[0, 0, -1, 3:].asnumpy(), chunked_hidden_states[0, 0, -1].asnumpy(), atol=1e-3)) - self.assertTrue( - np.allclose( - padded_hidden_states[0, 0, -1, :3].asnumpy(), - ops.zeros((3,), dtype=mindspore.float32).asnumpy(), - atol=1e-3, - ) - ) - - def test_pad_and_transpose_last_two_dims(self): - hidden_states = self._get_hidden_states() - self.assertEqual(hidden_states.shape, (1, 4, 8)) - padding = (0, 0, 0, 1) - - padded_hidden_states = LongformerSelfAttention._pad_and_swapaxes_last_two_dims(hidden_states, padding) - self.assertEqual(padded_hidden_states.shape, (1, 8, 5)) - - expected_added_dim = ops.zeros((5,), dtype=mindspore.float32) - self.assertTrue(np.allclose(expected_added_dim.asnumpy(), padded_hidden_states[0, -1, :].asnumpy(), atol=1e-6)) - self.assertTrue(np.allclose(hidden_states[0, -1, :].asnumpy(), padded_hidden_states.view(1, -1)[0, 24:32].asnumpy(), atol=1e-6)) - - def test_chunk(self): - hidden_states = self._get_hidden_states() - batch_size = 1 - seq_length = 8 - hidden_size = 4 - hidden_states = hidden_states.reshape((batch_size, seq_length, hidden_size)) - - chunked_hidden_states = LongformerSelfAttention._chunk(hidden_states, window_overlap=2) - - # expected slices across chunk and seq length dim - expected_slice_along_seq_length = mindspore.tensor( - [0.4983, -0.7584, -1.6944], dtype=mindspore.float32 - ) - expected_slice_along_chunk = mindspore.tensor( - [0.4983, -1.8348, -0.7584, 2.0514], dtype=mindspore.float32 - ) - - self.assertTrue(np.allclose(chunked_hidden_states[0, :, 0, 0].asnumpy(), expected_slice_along_seq_length.asnumpy(), atol=1e-3)) - self.assertTrue(np.allclose(chunked_hidden_states[0, 0, :, 0].asnumpy(), expected_slice_along_chunk.asnumpy(), atol=1e-3)) - self.assertEqual(chunked_hidden_states.shape, (1, 3, 4, 4)) - - def test_mask_invalid_locations(self): - hidden_states = self._get_hidden_states() - - batch_size = 1 - seq_length = 8 - hidden_size = 4 - hidden_states = hidden_states.reshape((batch_size, seq_length, hidden_size)) - 
chunked_hidden_states = LongformerSelfAttention._chunk(hidden_states, window_overlap=2) - - hid_states_1 = chunked_hidden_states.copy() - LongformerSelfAttention._mask_invalid_locations(hid_states_1, 1) - self.assertTrue(ops.isinf(hid_states_1).sum().item() == 8) - - hid_states_2 = chunked_hidden_states.copy() - LongformerSelfAttention._mask_invalid_locations(hid_states_2, 2) - self.assertTrue(ops.isinf(hid_states_2).sum().item() == 24) - - hid_states_3 = chunked_hidden_states.copy()[:, :, :, :3] - LongformerSelfAttention._mask_invalid_locations(hid_states_3, 2) - self.assertTrue(ops.isinf(hid_states_3).sum().item() == 24) - - hid_states_4 = chunked_hidden_states.copy()[:, :, 2:, :] - LongformerSelfAttention._mask_invalid_locations(hid_states_4, 2) - self.assertTrue(ops.isinf(hid_states_4).sum().item() == 12) - - def test_layer_local_attn(self): - model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny") - model.set_train(False) - layer = model.encoder.layer[0].attention.self - hidden_states = self._get_hidden_states() - batch_size, seq_length, hidden_size = hidden_states.shape - attention_mask = ops.zeros((batch_size, seq_length), dtype=mindspore.float32) - attention_mask[:, -2:] = -10000 - - is_index_masked = attention_mask < 0 - is_index_global_attn = attention_mask > 0 - is_global_attn = is_index_global_attn.flatten().any().item() - - output_hidden_states = layer( - hidden_states, - attention_mask=attention_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - )[0] - - self.assertEqual(output_hidden_states.shape, (1, 4, 8)) - self.assertTrue( - np.allclose( - output_hidden_states[0, 1].asnumpy(), - mindspore.tensor( - [0.0019, 0.0122, -0.0171, -0.0256, -0.0300, 0.0173, -0.0115, 0.0048], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - def test_layer_global_attn(self): - model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny") - model.set_train(False) - layer = model.encoder.layer[0].attention.self - hidden_states = ops.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0) - batch_size, seq_length, hidden_size = hidden_states.shape - attention_mask = ops.zeros((batch_size, seq_length), dtype=mindspore.float32) - - # create attn mask - attention_mask[0, -2:] = 10000.0 - attention_mask[0, -1:] = -10000.0 - attention_mask[1, 1:] = 10000.0 - - is_index_masked = attention_mask < 0 - is_index_global_attn = attention_mask > 0 - is_global_attn = is_index_global_attn.flatten().any().item() - - output_hidden_states = layer( - hidden_states, - attention_mask=attention_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - )[0] - - self.assertEqual(output_hidden_states.shape, (2, 4, 8)) - - self.assertTrue( - np.allclose( - output_hidden_states[0, 2].asnumpy(), - mindspore.tensor( - [-0.0651, -0.0393, 0.0309, -0.0342, -0.0066, -0.0155, -0.0209, -0.0494], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - self.assertTrue( - np.allclose( - output_hidden_states[1, -2].asnumpy(), - mindspore.tensor( - [-0.0405, -0.0384, 0.0396, -0.0374, -0.0341, 0.0136, 0.0014, -0.0571], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - def test_layer_attn_probs(self): - model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny") - model.set_train(False) - layer = model.encoder.layer[0].attention.self - hidden_states = 
ops.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0) - batch_size, seq_length, hidden_size = hidden_states.shape - attention_mask = ops.zeros((batch_size, seq_length), dtype=mindspore.float32) - - # create attn mask - attention_mask[0, -2:] = 10000.0 - attention_mask[0, -1:] = -10000.0 - attention_mask[1, 1:] = 10000.0 - - is_index_masked = attention_mask < 0 - is_index_global_attn = attention_mask > 0 - is_global_attn = is_index_global_attn.flatten().any().item() - - output_hidden_states, local_attentions, global_attentions = layer( - hidden_states, - attention_mask=attention_mask, - is_index_masked=is_index_masked, - is_index_global_attn=is_index_global_attn, - is_global_attn=is_global_attn, - output_attentions=True, - ) - - self.assertEqual(local_attentions.shape, (2, 4, 2, 8)) - self.assertEqual(global_attentions.shape, (2, 2, 3, 4)) - - # All tokens with global attention have weight 0 in local attentions. - self.assertTrue(ops.all(local_attentions[0, 2:4, :, :] == 0)) - self.assertTrue(ops.all(local_attentions[1, 1:4, :, :] == 0)) - - # The weight of all tokens with local attention must sum to 1. - self.assertTrue(ops.all(ops.abs(global_attentions[0, :, :2, :].sum(axis=-1) - 1) < 1e-6)) - self.assertTrue(ops.all(ops.abs(global_attentions[1, :, :1, :].sum(axis=-1) - 1) < 1e-6)) - - self.assertTrue( - np.allclose( - local_attentions[0, 0, 0, :].asnumpy(), - mindspore.tensor( - [0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - self.assertTrue( - np.allclose( - local_attentions[1, 0, 0, :].asnumpy(), - mindspore.tensor( - [0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - # All the global attention weights must sum to 1. - self.assertTrue(ops.all(ops.abs(global_attentions.sum(axis=-1) - 1) < 1e-6)) - - self.assertTrue( - np.allclose( - global_attentions[0, 0, 1, :].asnumpy(), - mindspore.tensor( - [0.2500, 0.2500, 0.2500, 0.2500], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - self.assertTrue( - np.allclose( - global_attentions[1, 0, 0, :].asnumpy(), - mindspore.tensor( - [0.2497, 0.2500, 0.2499, 0.2504], - dtype=mindspore.float32, - ).asnumpy(), - atol=1e-3, - ) - ) - - @slow - def test_inference_no_head(self): - model = LongformerModel.from_pretrained("allenai/longformer-base-4096") - - - # 'Hello world!' - input_ids = mindspore.tensor([[0, 20920, 232, 328, 1437, 2]], dtype=mindspore.int64) - attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - output = model(input_ids, attention_mask=attention_mask)[0] - output_without_mask = model(input_ids)[0] - - expected_output_slice = mindspore.tensor([0.0549, 0.1087, -0.1119, -0.0368, 0.0250]) - self.assertTrue(np.allclose(output[0, 0, -5:].asnumpy(), expected_output_slice.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(output_without_mask[0, 0, -5:].asnumpy(), expected_output_slice.asnumpy(), atol=1e-4)) - - @slow - def test_inference_no_head_long(self): - model = LongformerModel.from_pretrained("allenai/longformer-base-4096") - - - # 'Hello world! 
' repeated 1000 times - input_ids = mindspore.tensor( - [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=mindspore.int64 - ) # long input - - attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - global_attention_mask = ops.zeros(input_ids.shape, dtype=mindspore.int64) - global_attention_mask[:, [1, 4, 21]] = 1 # Set global attention on a few random positions - - output = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)[0] - - expected_output_sum = mindspore.tensor(74585.8594) - expected_output_mean = mindspore.tensor(0.0243) - self.assertTrue(np.allclose(output.sum().asnumpy(), expected_output_sum.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(output.mean().asnumpy(), expected_output_mean.asnumpy(), atol=1e-4)) - - @slow - def test_inference_masked_lm_long(self): - model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096") - - - # 'Hello world! ' repeated 1000 times - input_ids = mindspore.tensor( - [[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=mindspore.int64 - ) # long input - input_ids = input_ids - - loss, prediction_scores = model(input_ids, labels=input_ids).to_tuple() - - expected_loss = mindspore.tensor(0.0074) - expected_prediction_scores_sum = mindspore.tensor(-6.1048e08) - expected_prediction_scores_mean = mindspore.tensor(-3.0348) - - self.assertTrue(np.allclose(loss.asnumpy(), expected_loss.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(prediction_scores.sum().asnumpy(), expected_prediction_scores_sum.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(prediction_scores.mean().asnumpy(), expected_prediction_scores_mean.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/luke/__init__.py b/tests/transformers/models/luke/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/luke/test_modeling_luke.py b/tests/transformers/models/luke/test_modeling_luke.py deleted file mode 100644 index e848bdc12..000000000 --- a/tests/transformers/models/luke/test_modeling_luke.py +++ /dev/null @@ -1,906 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
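The LUKE tests removed below cover both randomly initialized models fed with entity inputs and slow integration tests against pretrained checkpoints. A minimal sketch of the pretrained entity-classification usage those integration tests exercise is shown here; it assumes network access to the studio-ousia/luke-base checkpoint, and the sample sentence and character span are illustrative only.

from mindnlp.transformers import LukeModel, LukeTokenizer

# Checkpoint and task mirror the deleted LukeModelIntegrationTests below.
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification")
model = LukeModel.from_pretrained("studio-ousia/luke-base").eval()

text = "Beyoncé lives in Los Angeles."
span = (0, 7)  # character span of the entity mention "Beyoncé" (illustrative)
encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="ms")

outputs = model(**encoding)
# LUKE returns word-level and entity-level hidden states separately.
print(outputs.last_hidden_state.shape)         # (1, sequence_length, 768) for luke-base
print(outputs.entity_last_hidden_state.shape)  # (1, 1, 768): one hidden state per entity span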
-"""Testing suite for the MindSpore LUKE model.""" - -import unittest - -from mindnlp.transformers import LukeConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipvelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore.common.api import _no_grad - from mindnlp.core import ops - - from mindnlp.transformers import ( - LukeForEntityClassification, - LukeForEntityPairClassification, - LukeForEntitySpanClassification, - LukeForMaskedLM, - LukeForMultipleChoice, - LukeForQuestionAnswering, - LukeForSequenceClassification, - LukeForTokenClassification, - LukeModel, - LukeTokenizer, - ) - - -class LukeModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - entity_length=3, - mention_length=5, - use_attention_mask=True, - use_token_type_ids=True, - use_entity_ids=True, - use_entity_attention_mask=True, - use_entity_token_type_ids=True, - use_entity_position_ids=True, - use_labels=True, - vocab_size=99, - entity_vocab_size=10, - entity_emb_size=6, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - num_entity_classification_labels=9, - num_entity_pair_classification_labels=6, - num_entity_span_classification_labels=4, - use_entity_aware_attention=True, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.entity_length = entity_length - self.mention_length = mention_length - self.use_attention_mask = use_attention_mask - self.use_token_type_ids = use_token_type_ids - self.use_entity_ids = use_entity_ids - self.use_entity_attention_mask = use_entity_attention_mask - self.use_entity_token_type_ids = use_entity_token_type_ids - self.use_entity_position_ids = use_entity_position_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.entity_vocab_size = entity_vocab_size - self.entity_emb_size = entity_emb_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.num_entity_classification_labels = num_entity_classification_labels - self.num_entity_pair_classification_labels = num_entity_pair_classification_labels - self.num_entity_span_classification_labels = num_entity_span_classification_labels - self.scope = scope - self.use_entity_aware_attention = use_entity_aware_attention - - self.encoder_seq_length = seq_length - self.key_length = seq_length - self.num_hidden_states_types = 2 # hidden_states and entity_hidden_states - - def prepare_config_and_inputs(self): - # prepare words - input_ids = 
ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - # prepare entities - entity_ids = ids_tensor([self.batch_size, self.entity_length], self.entity_vocab_size) - - entity_attention_mask = None - if self.use_entity_attention_mask: - entity_attention_mask = random_attention_mask([self.batch_size, self.entity_length]) - - entity_token_type_ids = None - if self.use_token_type_ids: - entity_token_type_ids = ids_tensor([self.batch_size, self.entity_length], self.type_vocab_size) - - entity_position_ids = None - if self.use_entity_position_ids: - entity_position_ids = ids_tensor( - [self.batch_size, self.entity_length, self.mention_length], self.mention_length - ) - - sequence_labels = None - token_labels = None - choice_labels = None - entity_labels = None - entity_classification_labels = None - entity_pair_classification_labels = None - entity_span_classification_labels = None - - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - entity_labels = ids_tensor([self.batch_size, self.entity_length], self.entity_vocab_size) - - entity_classification_labels = ids_tensor([self.batch_size], self.num_entity_classification_labels) - entity_pair_classification_labels = ids_tensor( - [self.batch_size], self.num_entity_pair_classification_labels - ) - entity_span_classification_labels = ids_tensor( - [self.batch_size, self.entity_length], self.num_entity_span_classification_labels - ) - - config = self.get_config() - - return ( - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ) - - def get_config(self): - return LukeConfig( - vocab_size=self.vocab_size, - entity_vocab_size=self.entity_vocab_size, - entity_emb_size=self.entity_emb_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - use_entity_aware_attention=self.use_entity_aware_attention, - ) - - def create_and_check_model( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - model = LukeModel(config=config) - model.eval() - # test with words + entities - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - 
entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - ) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual( - result.entity_last_hidden_state.shape, (self.batch_size, self.entity_length, self.hidden_size) - ) - - # test with words only - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_labels = self.num_entity_classification_labels - model = LukeForMaskedLM(config) - model.eval() - - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - labels=token_labels, - entity_labels=entity_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - if entity_ids is not None: - self.parent.assertEqual( - result.entity_logits.shape, (self.batch_size, self.entity_length, self.entity_vocab_size) - ) - else: - self.parent.assertIsNone(result.entity_logits) - - def create_and_check_for_entity_classification( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_labels = self.num_entity_classification_labels - model = LukeForEntityClassification(config) - model.eval() - - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - labels=entity_classification_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_entity_classification_labels)) - - def create_and_check_for_entity_pair_classification( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_labels = self.num_entity_pair_classification_labels - model = LukeForEntityClassification(config) - model.eval() - - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - labels=entity_pair_classification_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 
self.num_entity_pair_classification_labels)) - - def create_and_check_for_entity_span_classification( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_labels = self.num_entity_span_classification_labels - model = LukeForEntitySpanClassification(config) - model.eval() - - entity_start_positions = ids_tensor([self.batch_size, self.entity_length], self.seq_length) - entity_end_positions = ids_tensor([self.batch_size, self.entity_length], self.seq_length) - - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - entity_start_positions=entity_start_positions, - entity_end_positions=entity_end_positions, - labels=entity_span_classification_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.entity_length, self.num_entity_span_classification_labels) - ) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - model = LukeForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_labels = self.num_labels - model = LukeForSequenceClassification(config) - model.eval() - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_labels = self.num_labels - model = 
LukeForTokenClassification(config=config) - model.eval() - result = model( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - entity_ids=entity_ids, - entity_attention_mask=entity_attention_mask, - entity_token_type_ids=entity_token_type_ids, - entity_position_ids=entity_position_ids, - labels=token_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ): - config.num_choices = self.num_choices - model = LukeForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_attention_mask = attention_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_entity_ids = entity_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_entity_token_type_ids = ( - entity_token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - multiple_choice_entity_attention_mask = ( - entity_attention_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - multiple_choice_entity_position_ids = ( - entity_position_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1, -1)) - ) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_attention_mask, - token_type_ids=multiple_choice_token_type_ids, - entity_ids=multiple_choice_entity_ids, - entity_attention_mask=multiple_choice_entity_attention_mask, - entity_token_type_ids=multiple_choice_entity_token_type_ids, - entity_position_ids=multiple_choice_entity_position_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - token_type_ids, - entity_ids, - entity_attention_mask, - entity_token_type_ids, - entity_position_ids, - sequence_labels, - token_labels, - choice_labels, - entity_labels, - entity_classification_labels, - entity_pair_classification_labels, - entity_span_classification_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "entity_ids": entity_ids, - "entity_token_type_ids": entity_token_type_ids, - "entity_attention_mask": entity_attention_mask, - "entity_position_ids": entity_position_ids, - } - return config, inputs_dict - - -@require_mindspore -class LukeModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - LukeModel, - LukeForMaskedLM, - LukeForEntityClassification, - LukeForEntityPairClassification, - LukeForEntitySpanClassification, - LukeForQuestionAnswering, - LukeForSequenceClassification, - LukeForTokenClassification, - LukeForMultipleChoice, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": LukeModel, - "fill-mask": LukeForMaskedLM, - "question-answering": LukeForQuestionAnswering, - "text-classification": 
LukeForSequenceClassification, - "token-classification": LukeForTokenClassification, - "zero-shot": LukeForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_resize_embeddings = True - test_head_masking = True - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name in ["QAPipelineTests", "ZeroShotClassificationPipelineTests"]: - return True - - return False - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - entity_inputs_dict = {k: v for k, v in inputs_dict.items() if k.startswith("entity")} - inputs_dict = {k: v for k, v in inputs_dict.items() if not k.startswith("entity")} - - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - if model_class == LukeForMultipleChoice: - entity_inputs_dict = { - k: v.unsqueeze(1).broadcast_to((-1, self.model_tester.num_choices, -1)) - if v.ndim == 2 - else v.unsqueeze(1).broadcast_to((-1, self.model_tester.num_choices, -1, -1)) - for k, v in entity_inputs_dict.items() - } - inputs_dict.update(entity_inputs_dict) - - if model_class == LukeForEntitySpanClassification: - inputs_dict["entity_start_positions"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.entity_length), dtype=mindspore.int64 - ) - inputs_dict["entity_end_positions"] = ops.ones( - (self.model_tester.batch_size, self.model_tester.entity_length), dtype=mindspore.int64 - ) - - if return_labels: - if model_class in ( - LukeForEntityClassification, - LukeForEntityPairClassification, - LukeForSequenceClassification, - LukeForMultipleChoice, - ): - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class == LukeForEntitySpanClassification: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.entity_length), - dtype=mindspore.int64, - ) - elif model_class == LukeForTokenClassification: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - elif model_class == LukeForMaskedLM: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["entity_labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.entity_length), - dtype=mindspore.int64, - ) - - return inputs_dict - - def setUp(self): - self.model_tester = LukeModelTester(self) - self.config_tester = ConfigTester(self, config_class=LukeConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "studio-ousia/luke-base" - model = LukeModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_masked_lm_with_word_only(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config_and_inputs = (*config_and_inputs[:4], *((None,) * len(config_and_inputs[4:]))) - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def 
test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_entity_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_entity_classification(*config_and_inputs) - - def test_for_entity_pair_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_entity_pair_classification(*config_and_inputs) - - def test_for_entity_span_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_entity_span_classification(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_length = self.model_tester.seq_length - entity_length = self.model_tester.entity_length - key_length = seq_length + entity_length - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_length + entity_length, key_length], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = self.model_tester.num_hidden_states_types - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_length + entity_length, key_length], - ) - - def test_entity_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with _no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - 
entity_hidden_states = outputs.entity_hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(entity_hidden_states), expected_num_layers) - - entity_length = self.model_tester.entity_length - - self.assertListEqual( - list(entity_hidden_states[0].shape[-2:]), - [entity_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -@require_mindspore -class LukeModelIntegrationTests(unittest.TestCase): - @slow - def test_inference_base_model(self): - model = LukeModel.from_pretrained("studio-ousia/luke-base").eval() - - tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", task="entity_classification") - text = ( - "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped" - " the new world number one avoid a humiliating second- round exit at Wimbledon ." - ) - span = (39, 42) - encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="ms") - - # move all values to device - for key, value in encoding.items(): - encoding[key] = encoding[key] - - outputs = model(**encoding) - - # Verify word hidden states - expected_shape = (1, 42, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.0037, 0.1368, -0.0091], [0.1099, 0.3329, -0.1095], [0.0765, 0.5335, 0.1179]] - ) - self.assertTrue(ops.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) - - # Verify entity hidden states - expected_shape = (1, 1, 768) - self.assertEqual(outputs.entity_last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor([[0.1457, 0.1044, 0.0174]]) - self.assertTrue(ops.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_large_model(self): - model = LukeModel.from_pretrained("studio-ousia/luke-large").eval() - - tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large", task="entity_classification") - text = ( - "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped" - " the new world number one avoid a humiliating second- round exit at Wimbledon ." 
- ) - span = (39, 42) - encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="ms") - - # move all values to device - for key, value in encoding.items(): - encoding[key] = encoding[key] - - outputs = model(**encoding) - - # Verify word hidden states - expected_shape = (1, 42, 1024) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]] - ) - self.assertTrue(ops.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) - - # Verify entity hidden states - expected_shape = (1, 1, 1024) - self.assertEqual(outputs.entity_last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor([[0.0466, -0.0106, -0.0179]]) - self.assertTrue(ops.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/lxmert/__init__.py b/tests/transformers/models/lxmert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/lxmert/test_modeling_lxmert.py b/tests/transformers/models/lxmert/test_modeling_lxmert.py deleted file mode 100644 index 426098223..000000000 --- a/tests/transformers/models/lxmert/test_modeling_lxmert.py +++ /dev/null @@ -1,908 +0,0 @@ -# coding=utf-8 -# Copyright 2018 LXMERT Authors, The Hugging Face Team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
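Both the LUKE integration tests above and the LXMERT integration test further below close with the same slice-comparison check: load a public checkpoint, run one fixed input, and compare a tiny corner of the output against hard-coded reference values within an absolute tolerance. The following is a condensed, illustrative sketch of that pattern, reusing the checkpoint, input sentence, entity span, and reference values that appear in the LUKE test above; the class and method names are placeholders, not part of the removed files.

import unittest
import mindspore
from mindnlp.core import ops
from mindnlp.transformers import LukeModel, LukeTokenizer
from mindnlp.utils.testing_utils import require_mindspore, slow


@require_mindspore
class SliceComparisonExample(unittest.TestCase):
    @slow
    def test_entity_hidden_state_slice(self):
        # Public checkpoint and the entity-classification tokenizer variant used above.
        model = LukeModel.from_pretrained("studio-ousia/luke-base").eval()
        tokenizer = LukeTokenizer.from_pretrained(
            "studio-ousia/luke-base", task="entity_classification"
        )

        text = (
            "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped"
            " the new world number one avoid a humiliating second- round exit at Wimbledon ."
        )
        # Character span of the entity mention, taken verbatim from the test above.
        encoding = tokenizer(text, entity_spans=[(39, 42)], add_prefix_space=True, return_tensors="ms")

        outputs = model(**encoding)

        # Only a small slice of the output is compared, with an absolute tolerance,
        # so the check stays robust to minor numerical differences across backends.
        expected = mindspore.tensor([[0.1457, 0.1044, 0.0174]])
        self.assertTrue(
            ops.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected, atol=1e-4)
        )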
- - -import copy -import unittest - -import numpy as np - -from mindnlp.transformers import LxmertConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import ops - from mindnlp.transformers.models.auto.modeling_auto import ( - MODEL_FOR_PRETRAINING_MAPPING, - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - ) - from mindnlp.transformers import ( - LxmertForPreTraining, - LxmertForQuestionAnswering, - LxmertModel, - ) - -class LxmertModelTester: - def __init__( - self, - parent, - vocab_size=300, - hidden_size=28, - num_attention_heads=2, - num_labels=2, - intermediate_size=64, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - num_qa_labels=30, - num_object_labels=16, - num_attr_labels=4, - num_visual_features=10, - l_layers=2, - x_layers=1, - r_layers=1, - visual_feat_dim=128, - visual_pos_dim=4, - visual_loss_normalizer=6.67, - seq_length=20, - batch_size=4, - is_training=True, - task_matched=True, - task_mask_lm=True, - task_obj_predict=True, - task_qa=True, - visual_obj_loss=True, - visual_attr_loss=True, - visual_feat_loss=True, - use_token_type_ids=True, - use_lang_mask=True, - output_attentions=False, - output_hidden_states=False, - scope=None, - ): - self.parent = parent - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_labels = num_labels - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.pad_token_id = pad_token_id - self.num_qa_labels = num_qa_labels - self.num_object_labels = num_object_labels - self.num_attr_labels = num_attr_labels - self.l_layers = l_layers - self.x_layers = x_layers - self.r_layers = r_layers - self.visual_feat_dim = visual_feat_dim - self.visual_pos_dim = visual_pos_dim - self.visual_loss_normalizer = visual_loss_normalizer - self.seq_length = seq_length - self.batch_size = batch_size - self.is_training = is_training - self.use_lang_mask = use_lang_mask - self.task_matched = task_matched - self.task_mask_lm = task_mask_lm - self.task_obj_predict = task_obj_predict - self.task_qa = task_qa - self.visual_obj_loss = visual_obj_loss - self.visual_attr_loss = visual_attr_loss - self.visual_feat_loss = visual_feat_loss - self.num_visual_features = num_visual_features - self.use_token_type_ids = use_token_type_ids - self.output_attentions = output_attentions - self.output_hidden_states = output_hidden_states - self.scope = scope - self.num_hidden_layers = { - "vision": r_layers, - "cross_encoder": x_layers, - "language": l_layers, - } - - def prepare_config_and_inputs(self): - output_attentions = self.output_attentions - input_ids = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=self.vocab_size - ) - visual_feats = ops.rand( - self.batch_size, - self.num_visual_features, - self.visual_feat_dim, - ) - 
bounding_boxes = ops.rand(self.batch_size, self.num_visual_features, 4) - - input_mask = None - if self.use_lang_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - obj_labels = None - if self.task_obj_predict: - obj_labels = {} - if self.visual_attr_loss and self.task_obj_predict: - obj_labels["attr"] = ( - ids_tensor( - [self.batch_size, self.num_visual_features], self.num_attr_labels - ), - ids_tensor( - [self.batch_size, self.num_visual_features], self.num_attr_labels - ), - ) - if self.visual_feat_loss and self.task_obj_predict: - obj_labels["feat"] = ( - ids_tensor( - [self.batch_size, self.num_visual_features, self.visual_feat_dim], - self.num_visual_features, - ), - ids_tensor( - [self.batch_size, self.num_visual_features], - self.num_visual_features, - ), - ) - if self.visual_obj_loss and self.task_obj_predict: - obj_labels["obj"] = ( - ids_tensor( - [self.batch_size, self.num_visual_features], self.num_object_labels - ), - ids_tensor( - [self.batch_size, self.num_visual_features], self.num_object_labels - ), - ) - ans = None - if self.task_qa: - ans = ids_tensor([self.batch_size], self.num_qa_labels) - masked_lm_labels = None - if self.task_mask_lm: - masked_lm_labels = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size - ) - matched_label = None - if self.task_matched: - matched_label = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return ( - config, - input_ids, - visual_feats, - bounding_boxes, - token_type_ids, - input_mask, - obj_labels, - masked_lm_labels, - matched_label, - ans, - output_attentions, - ) - - def get_config(self): - return LxmertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_attention_heads=self.num_attention_heads, - num_labels=self.num_labels, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - pad_token_id=self.pad_token_id, - num_qa_labels=self.num_qa_labels, - num_object_labels=self.num_object_labels, - num_attr_labels=self.num_attr_labels, - l_layers=self.l_layers, - x_layers=self.x_layers, - r_layers=self.r_layers, - visual_feat_dim=self.visual_feat_dim, - visual_pos_dim=self.visual_pos_dim, - visual_loss_normalizer=self.visual_loss_normalizer, - task_matched=self.task_matched, - task_mask_lm=self.task_mask_lm, - task_obj_predict=self.task_obj_predict, - task_qa=self.task_qa, - visual_obj_loss=self.visual_obj_loss, - visual_attr_loss=self.visual_attr_loss, - visual_feat_loss=self.visual_feat_loss, - output_attentions=self.output_attentions, - output_hidden_states=self.output_hidden_states, - ) - - def create_and_check_lxmert_model( - self, - config, - input_ids, - visual_feats, - bounding_boxes, - token_type_ids, - input_mask, - obj_labels, - masked_lm_labels, - matched_label, - ans, - output_attentions, - ): - model = LxmertModel(config=config) - model.set_train(False) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - output_attentions=output_attentions, - ) - result = model( - input_ids, - visual_feats, - 
bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - output_attentions=not output_attentions, - ) - result = model(input_ids, visual_feats, bounding_boxes, return_dict=False) - result = model(input_ids, visual_feats, bounding_boxes, return_dict=True) - - self.parent.assertEqual( - result.language_output.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - self.parent.assertEqual( - result.vision_output.shape, - (self.batch_size, self.num_visual_features, self.hidden_size), - ) - self.parent.assertEqual( - result.pooled_output.shape, (self.batch_size, self.hidden_size) - ) - - def create_and_check_lxmert_for_question_answering( - self, - config, - input_ids, - visual_feats, - bounding_boxes, - token_type_ids, - input_mask, - obj_labels, - masked_lm_labels, - matched_label, - ans, - output_attentions, - ): - model = LxmertForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - labels=ans, - output_attentions=output_attentions, - ) - result = model(input_ids, visual_feats, bounding_boxes, labels=ans) - result = model( - input_ids, - visual_feats, - bounding_boxes, - labels=ans, - token_type_ids=token_type_ids, - attention_mask=input_mask, - output_attentions=output_attentions, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - labels=ans, - output_attentions=not output_attentions, - ) - - self.parent.assertEqual( - result.question_answering_score.shape, (self.batch_size, self.num_qa_labels) - ) - - def create_and_check_lxmert_for_pretraining( - self, - config, - input_ids, - visual_feats, - bounding_boxes, - token_type_ids, - input_mask, - obj_labels, - masked_lm_labels, - matched_label, - ans, - output_attentions, - ): - model = LxmertForPreTraining(config=config) - model.set_train(False) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - masked_lm_labels=masked_lm_labels, - obj_labels=obj_labels, - matched_label=matched_label, - ans=ans, - output_attentions=output_attentions, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - masked_lm_labels=masked_lm_labels, - output_attentions=not output_attentions, - return_dict=False, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - masked_lm_labels=masked_lm_labels, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - obj_labels=obj_labels, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - matched_label=matched_label, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ans=ans, - ) - result = model( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - masked_lm_labels=masked_lm_labels, - obj_labels=obj_labels, - matched_label=matched_label, - ans=ans, - output_attentions=not output_attentions, - ) - - self.parent.assertEqual( - result.prediction_logits.shape, - (self.batch_size, self.seq_length, self.vocab_size), - ) - - def resize_lxmert_num_qa_labels( - self, 
- config, - input_ids, - visual_feats, - bounding_boxes, - token_type_ids, - input_mask, - obj_labels, - masked_lm_labels, - matched_label, - ans, - output_attentions, - ): - start_labels = config.num_qa_labels - num_large_labels = config.num_qa_labels * 2 - num_small_labels = int(config.num_qa_labels * 2) - less_labels_ans = ids_tensor([self.batch_size], num_small_labels) - more_labels_ans = ids_tensor([self.batch_size], num_large_labels) - model_pretrain = LxmertForPreTraining(config=config) - model_qa = LxmertForQuestionAnswering(config=config) - config.num_labels = num_small_labels - end_labels = config.num_labels - - result_pretrain = model_pretrain( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ans=ans, - ) - - result_qa = model_qa( - input_ids, - visual_feats, - bounding_boxes, - labels=ans, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ) - - model_pretrain.resize_num_qa_labels(num_small_labels) - model_qa.resize_num_qa_labels(num_small_labels) - - result_pretrain_less = model_pretrain( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ans=less_labels_ans, - ) - - result_qa_less = model_qa( - input_ids, - visual_feats, - bounding_boxes, - labels=less_labels_ans, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ) - - model_pretrain.resize_num_qa_labels(num_large_labels) - model_qa.resize_num_qa_labels(num_large_labels) - - result_pretrain_more = model_pretrain( - input_ids, - visual_feats, - bounding_boxes, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ans=more_labels_ans, - ) - - result_qa_more = model_qa( - input_ids, - visual_feats, - bounding_boxes, - labels=more_labels_ans, - token_type_ids=token_type_ids, - attention_mask=input_mask, - ) - - model_qa_labels = model_qa.num_qa_labels - - self.parent.assertNotEqual(start_labels, end_labels) - self.parent.assertNotEqual(model_qa_labels, start_labels) - self.parent.assertEqual( - result_qa.question_answering_score.shape, (self.batch_size, start_labels) - ) - self.parent.assertEqual( - result_pretrain.question_answering_score.shape, - (self.batch_size, start_labels), - ) - self.parent.assertEqual( - result_qa_less.question_answering_score.shape, - (self.batch_size, num_small_labels), - ) - self.parent.assertEqual( - result_pretrain_less.question_answering_score.shape, - (self.batch_size, num_small_labels), - ) - self.parent.assertEqual( - result_qa_more.question_answering_score.shape, - (self.batch_size, num_large_labels), - ) - self.parent.assertEqual( - result_pretrain_more.question_answering_score.shape, - (self.batch_size, num_large_labels), - ) - - def prepare_config_and_inputs_for_common(self, return_obj_labels=False): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - visual_feats, - bounding_boxes, - token_type_ids, - input_mask, - obj_labels, - masked_lm_labels, - matched_label, - ans, - output_attentions, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "visual_feats": visual_feats, - "visual_pos": bounding_boxes, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - - if return_obj_labels: - inputs_dict["obj_labels"] = obj_labels - else: - config.task_obj_predict = False - - return config, inputs_dict - - -@require_mindspore -class LxmertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (LxmertModel, LxmertForPreTraining, LxmertForQuestionAnswering) - 
if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": LxmertModel, - "question-answering": LxmertForQuestionAnswering, - } - if is_mindspore_available() - else {} - ) - - fx_compatible = True - test_head_masking = False - test_pruning = False - test_torchscript = False - - # overwrite function because qa models takes different input label shape - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if return_labels: - if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING): - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=ms.int64 - ) - elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - # special case for models like BERT that use multi-loss training for PreTraining - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=ms.int64, - ) - return inputs_dict - - def setUp(self): - self.model_tester = LxmertModelTester(self) - self.config_tester = ConfigTester( - self, config_class=LxmertConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_lxmert_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lxmert_model(*config_and_inputs) - - def test_lxmert_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lxmert_for_question_answering( - *config_and_inputs - ) - - def test_lxmert_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lxmert_for_pretraining(*config_and_inputs) - - def test_lxmert_question_answering_labels_resize(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.resize_lxmert_num_qa_labels(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "unc-nlp/lxmert-base-uncased" - model = LxmertModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - seq_len = getattr(self.model_tester, "seq_length", None) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - encoder_key_length = getattr( - self.model_tester, "key_length", encoder_seq_length - ) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - language_attentions, vision_attentions, cross_encoder_attentions = ( - outputs[-3], - outputs[-2], - outputs[-1], - ) - - self.assertEqual( - len(language_attentions), - self.model_tester.num_hidden_layers["language"], - ) - self.assertEqual( - len(vision_attentions), self.model_tester.num_hidden_layers["vision"] - ) - self.assertEqual( - len(cross_encoder_attentions), - self.model_tester.num_hidden_layers["cross_encoder"], - ) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - 
model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - language_attentions, vision_attentions, cross_encoder_attentions = ( - outputs[-3], - outputs[-2], - outputs[-1], - ) - self.assertEqual( - len(language_attentions), - self.model_tester.num_hidden_layers["language"], - ) - self.assertEqual( - len(vision_attentions), self.model_tester.num_hidden_layers["vision"] - ) - self.assertEqual( - len(cross_encoder_attentions), - self.model_tester.num_hidden_layers["cross_encoder"], - ) - - attentions = [ - language_attentions, - vision_attentions, - cross_encoder_attentions, - ] - attention_shapes = [ - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], - [ - self.model_tester.num_attention_heads, - self.model_tester.num_visual_features, - self.model_tester.num_visual_features, - ], - [ - self.model_tester.num_attention_heads, - encoder_key_length, - self.model_tester.num_visual_features, - ], - ] - - for attention, attention_shape in zip(attentions, attention_shapes): - self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # 2 hidden states were added - self.assertEqual(out_len + 2, len(outputs)) - - language_attentions, vision_attentions, cross_encoder_attentions = ( - outputs[-3], - outputs[-2], - outputs[-1], - ) - self.assertEqual( - len(language_attentions), - self.model_tester.num_hidden_layers["language"], - ) - self.assertEqual( - len(vision_attentions), self.model_tester.num_hidden_layers["vision"] - ) - self.assertEqual( - len(cross_encoder_attentions), - self.model_tester.num_hidden_layers["cross_encoder"], - ) - - attentions = [ - language_attentions, - vision_attentions, - cross_encoder_attentions, - ] - attention_shapes = [ - [ - self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length, - ], - [ - self.model_tester.num_attention_heads, - self.model_tester.num_visual_features, - self.model_tester.num_visual_features, - ], - [ - self.model_tester.num_attention_heads, - encoder_key_length, - self.model_tester.num_visual_features, - ], - ] - - for attention, attention_shape in zip(attentions, attention_shapes): - self.assertListEqual(list(attention[0].shape[-3:]), attention_shape) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1] - - self.assertEqual( - len(language_hidden_states), - self.model_tester.num_hidden_layers["language"] + 1, - ) - self.assertEqual( - len(vision_hidden_states), - self.model_tester.num_hidden_layers["vision"] + 1, - ) - - seq_length = self.model_tester.seq_length - num_visual_features = self.model_tester.num_visual_features - - self.assertListEqual( - list(language_hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - self.assertListEqual( - list(vision_hidden_states[0].shape[-2:]), - [num_visual_features, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for 
model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip("MindSpore has no .grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - hidden_states_lang = outputs.language_hidden_states[0] - attentions_lang = outputs.language_attentions[0] - - hidden_states_vision = outputs.vision_hidden_states[0] - attentions_vision = outputs.vision_attentions[0] - - outputs.language_output.flatten()[0].backward(retain_graph=True) - outputs.vision_output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states_lang.grad) - self.assertIsNotNone(attentions_vision.grad) - self.assertIsNotNone(hidden_states_vision.grad) - self.assertIsNotNone(attentions_vision.grad) - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - -@require_mindspore -class LxmertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - model = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased", from_pt=True) - input_ids = ms.tensor( - [[101, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 102]] - ) - num_visual_features = 10 - _, visual_feats = np.random.seed(0), np.random.rand( - 1, num_visual_features, model.config.visual_feat_dim - ) - _, visual_pos = np.random.seed(0), np.random.rand(1, num_visual_features, 4) - - def as_tensor(value, dtype=None): - if isinstance(value, list) and isinstance(value[0], np.ndarray): - return ms.tensor(np.array(value), dtype) - if isinstance(value, np.ndarray) and value.shape == (0,): - return ms.tensor( - ms._c_expression.Tensor(value, dtype) - ) # pylint: disable=c-extension-no-member - return ms.tensor(value, dtype) - - visual_feats = as_tensor(visual_feats, dtype=ms.float32) - visual_pos = as_tensor(visual_pos, dtype=ms.float32) - output = model(input_ids, visual_feats=visual_feats, visual_pos=visual_pos)[0] - expected_shape = (1, 11, 768) - self.assertEqual(expected_shape, output.shape) - expected_slice = ms.tensor( - [ - [ - [0.2417, -0.9807, 0.1480], - [1.2541, -0.8320, 0.5112], - [1.4070, -1.1052, 0.6990], - ] - ] - ) - - self.assertTrue( - np.allclose( - output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4 - ) - ) diff --git a/tests/transformers/models/m2m_100/__init__.py b/tests/transformers/models/m2m_100/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/m2m_100/test_modeling_m2m_100.py b/tests/transformers/models/m2m_100/test_modeling_m2m_100.py deleted file mode 100644 index 381add627..000000000 --- 
a/tests/transformers/models/m2m_100/test_modeling_m2m_100.py +++ /dev/null @@ -1,453 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch M2M100 model.""" - -import copy -import tempfile -import unittest -import numpy as np -import pytest -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers import M2M100Config -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.transformers import M2M100ForConditionalGeneration, M2M100Model, M2M100Tokenizer - from mindnlp.transformers.models.m2m_100.modeling_m2m_100 import M2M100Decoder, M2M100Encoder - - -def prepare_m2m_100_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class M2M100ModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = 
attention_probs_dropout_prob - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids[:, -1] = self.eos_token_id # Eos Token - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - # we need to clamp the input ids here to avoid having pad token in between - # this is because for M2M100 the position_ids are prepared such that - # all pad tokens have pos id = 2 and rest are between 2..seq_length - # and the seq_length here is seq_length - num_pad_tokens - # but when using past, there is no way of knowing if the past input ids had - # pad tokens in them, which results in incorrect seq_lenth and which in turn results in - # position_ids being off by num_pad_tokens in past input - input_ids = input_ids.clamp(self.pad_token_id + 1) - decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1) - - config = self.get_config() - inputs_dict = prepare_m2m_100_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return M2M100Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - encoder_layerdrop=self.encoder_layerdrop, - decoder_layerdrop=self.decoder_layerdrop, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = M2M100Model(config=config).get_decoder().eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2).to(attention_mask.dtype) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == 
next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = M2M100Model(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = M2M100Encoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = M2M100Decoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class M2M100ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - M2M100Model, - M2M100ForConditionalGeneration, - )# M2M100Model, - if is_mindspore_available() - else () - ) - all_generative_model_classes = (M2M100ForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": M2M100Model, - "summarization": M2M100ForConditionalGeneration, - "text2text-generation": M2M100ForConditionalGeneration, - "translation": M2M100ForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = True - test_pruning = False - test_missing_keys = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "TranslationPipelineTests": - # Get `ValueError: Translation requires a `src_lang` and a `tgt_lang` for this model`. - # `M2M100Config` was never used in pipeline tests: cannot create a simple tokenizer. 
- return True - - return False - - def setUp(self): - self.model_tester = M2M100ModelTester(self) - self.config_tester = ConfigTester(self, config_class=M2M100Config) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (M2M100Model, M2M100ForConditionalGeneration): - model = model_class(config) - model.set_train(False) - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - model(**inputs)[0] - - #@require_mindspore_fp16 - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = M2M100ForConditionalGeneration(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -TOLERANCE = 1e-4 - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@slow -class M2M100ModelIntegrationTests(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - - def test_inference_no_head(self): - model = M2M100Model.from_pretrained("facebook/m2m100_418M") - input_ids = _long_tensor([[128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38, 2]]) - decoder_input_ids = _long_tensor([[2, 128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38]]) - inputs_dict = prepare_m2m_100_inputs_dict(model.config, input_ids, decoder_input_ids) - - output = model(**inputs_dict)[0] - expected_shape = (1, 11, 1024) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = mindspore.tensor( - [[-0.7780, -0.1676, 0.1038], [-6.7556, -1.3992, 0.0567], [-7.5383, -0.5920, -0.2779]] - ) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=TOLERANCE)) - - def test_inference_head(self): - model = 
M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") - - # change to intended input - input_ids = _long_tensor([[128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38, 2]]) - decoder_input_ids = _long_tensor([[2, 128028, 98, 12, 30527, 2732, 159, 7755, 61904, 39144, 38]]) - inputs_dict = prepare_m2m_100_inputs_dict(model.config, input_ids, decoder_input_ids) - output = model(**inputs_dict)[0] - expected_shape = (1, 11, model.config.vocab_size) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = mindspore.tensor( - [[-1.0448, -1.0411, 3.7992], [-3.2191, -3.2386, -1.3451], [-3.6210, -3.5993, 0.4925]] - ) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=TOLERANCE)) - - def test_seq_to_seq_generation(self): - model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") - tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="fr", tgt_lang="en") - - src_fr = [ - "L'affaire NSA souligne l'absence totale de débat sur le renseignement", - "Selon moi, il y a deux niveaux de réponse de la part du gouvernement français.", - "Lorsque François Hollande téléphone à Barack Obama ou quand le ministre des affaires étrangères Laurent" - " Fabius convoque l'ambassadeur des Etats-Unis, ils réagissent à une vraie découverte, qui est celle de" - " l'ampleur de la surveillance américaine sur l'ensemble des communications en France.", - ] - - # The below article tests that we don't add any hypotheses outside of the top n_beams - dct = tokenizer(src_fr, padding=True, return_tensors="ms") - - hypotheses_batch = model.generate( - input_ids=dct["input_ids"], - attention_mask=dct["attention_mask"], - num_beams=5, - forced_bos_token_id=tokenizer.get_lang_id("en"), - ) - - expected_en = [ - "The NSA case highlights the total absence of intelligence debate", - "I think there are two levels of response from the French government.", - "When François Hollande calls Barack Obama or when Foreign Minister Laurent Fabius calls the U.S." - " Ambassador, they respond to a real discovery, which is that of the scale of U.S. 
surveillance on all" - " communications in France.", - ] - - generated = tokenizer.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - assert generated == expected_en - - # @require_flash_attn - # @require_mindspore_gpu - # @pytest.mark.flash_attn_test - # @slow - # def test_flash_attn_2_seq_to_seq_generation(self): - # """ - # Overwritting the common test as the test is flaky on tiny models - # """ - # model = M2M100ForConditionalGeneration.from_pretrained( - # "facebook/m2m100_418M", attn_implementation="flash_attention_2" - # ).to(torch_device) - - # tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="fr", tgt_lang="en") - - # src_fr = [ - # "L'affaire NSA souligne l'absence totale de débat sur le renseignement", - # "Selon moi, il y a deux niveaux de réponse de la part du gouvernement français.", - # "Lorsque François Hollande téléphone à Barack Obama ou quand le ministre des affaires étrangères Laurent" - # " Fabius convoque l'ambassadeur des Etats-Unis, ils réagissent à une vraie découverte, qui est celle de" - # " l'ampleur de la surveillance américaine sur l'ensemble des communications en France.", - # ] - - # # The below article tests that we don't add any hypotheses outside of the top n_beams - # dct = tokenizer(src_fr, padding=True, return_tensors="ms") - - # hypotheses_batch = model.generate( - # input_ids=dct["input_ids"].to(torch_device), - # attention_mask=dct["attention_mask"].to(torch_device), - # num_beams=5, - # forced_bos_token_id=tokenizer.get_lang_id("en"), - # ) - - # expected_en = [ - # "The NSA case highlights the total absence of intelligence debate", - # "I think there are two levels of response from the French government.", - # "When François Hollande calls Barack Obama or when Foreign Minister Laurent Fabius calls the U.S." - # " Ambassador, they respond to a real discovery, which is that of the scale of U.S. surveillance on all" - # " communications in France.", - # ] - - # generated = tokenizer.batch_decode( - # hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - # ) - # assert generated == expected_en diff --git a/tests/transformers/models/m2m_100/test_tokenization_m2m_100.py b/tests/transformers/models/m2m_100/test_tokenization_m2m_100.py deleted file mode 100644 index 6320c9092..000000000 --- a/tests/transformers/models/m2m_100/test_tokenization_m2m_100.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
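The seq-to-seq generation test above selects the translation direction by forcing the first decoder token to the target-language id. The snippet below is a condensed sketch of that flow, using only the calls, checkpoint, source sentence, and expected output that appear in the test above; it is illustrative rather than part of the removed files.

from mindnlp.transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Checkpoint and language codes as used in the M2M100 tests above.
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="fr", tgt_lang="en")

src_fr = ["L'affaire NSA souligne l'absence totale de débat sur le renseignement"]
batch = tokenizer(src_fr, padding=True, return_tensors="ms")

# forced_bos_token_id pins the first generated token to the English language code,
# which is how M2M100 picks the target language at decode time.
generated = model.generate(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    num_beams=5,
    forced_bos_token_id=tokenizer.get_lang_id("en"),
)
print(tokenizer.batch_decode(generated.tolist(), skip_special_tokens=True))
# Expected (per the test above): "The NSA case highlights the total absence of intelligence debate"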
- -import tempfile -import unittest -from pathlib import Path -from shutil import copyfile -from mindnlp.transformers import M2M100Tokenizer -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - get_tests_dir, - nested_simplify, - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import is_sentencepiece_available - - -if is_sentencepiece_available(): - from mindnlp.transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json - -from ...test_tokenization_common import TokenizerTesterMixin - - -if is_sentencepiece_available(): - SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") - - -if is_mindspore_available(): - from mindnlp.transformers.models.m2m_100.modeling_m2m_100 import shift_tokens_right - -EN_CODE = 128022 -FR_CODE = 128028 - - -@require_sentencepiece -class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "facebook/m2m100_418M" - tokenizer_class = M2M100Tokenizer - test_rust_tokenizer = False - test_seq2seq = False - test_sentencepiece = True - - def setUp(self): - super().setUp() - - vocab = ["", "", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", ""] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - save_dir = Path(self.tmpdirname) - save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) - if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): - copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"]) - - tokenizer = M2M100Tokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return M2M100Tokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - return ( - "This is a test", - "This is a test", - ) - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 0 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - tokenizer = self.get_tokenizer() - vocab_keys = list(tokenizer.get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "") - # The length of the vocab keys can be different - # self.assertEqual(len(vocab_keys), tokenizer.vocab_size) - - def test_full_tokenizer(self): - tokenizer = self.get_tokenizer() - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), - [2, 3, 4, 5, 6], - ) - - back_tokens = tokenizer.convert_ids_to_tokens([2, 3, 4, 5, 6]) - self.assertListEqual(back_tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - text = tokenizer.convert_tokens_to_string(tokens) - self.assertEqual(text, "This is a test") - - @slow - def test_tokenizer_integration(self): - expected_encoding = {'input_ids': [[128022, 110108, 397, 11, 38272, 2247, 124811, 285, 18105, 1586, 207, 7, 39534, 4428, 397, 1019, 18105, 1586, 207, 7, 41337, 16786, 241, 7, 20214, 17, 125690, 10398, 7, 44378, 58069, 68342, 7798, 7343, 11, 299, 33310, 4, 158, 37350, 94077, 4569, 299, 33310, 90, 4, 52840, 290, 4, 31270, 112, 299, 682, 4, 52840, 39953, 14079, 193, 52519, 90894, 17894, 120697, 11, 40445, 551, 17, 1019, 52519, 90894, 17756, 963, 11, 40445, 480, 17, 9792, 1120, 5173, 1393, 6240, 16786, 
241, 120996, 28, 1245, 1393, 118240, 11123, 1019, 93612, 2691, 10618, 98058, 120409, 1928, 279, 4, 40683, 367, 178, 207, 1019, 103, 103121, 506, 65296, 5, 2], [128022, 21217, 367, 117, 125450, 128, 719, 7, 7308, 40, 93612, 12669, 1116, 16704, 71, 17785, 3699, 15592, 35, 144, 9584, 241, 11943, 713, 950, 799, 2247, 88427, 150, 149, 118813, 120706, 1019, 106906, 81518, 28, 1224, 22799, 397, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [128022, 1658, 123311, 5155, 5578, 4722, 279, 14947, 2366, 1120, 1197, 14, 1348, 9232, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip - - self.tokenizer_integration_test_util( - expected_encoding=expected_encoding, - model_name="facebook/m2m100_418M", - revision="c168bae485c864188cf9aa0e4108b0b6934dc91e", - ) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class M2M100TokenizerIntegrationTest(unittest.TestCase): - checkpoint_name = "facebook/m2m100_418M" - src_text = [ - "In my opinion, there are two levels of response from the French government.", - "NSA Affair Emphasizes Complete Lack of Debate on Intelligence", - ] - tgt_text = [ - "Selon moi, il y a deux niveaux de réponse de la part du gouvernement français.", - "L'affaire NSA souligne l'absence totale de débat sur le renseignement", - ] - - expected_src_tokens = [EN_CODE, 593, 1949, 115781, 4, 71586, 4234, 60633, 126233, 432, 123808, 15592, 1197, 117132, 120618, 5, 2] # fmt: skip - - @classmethod - def setUpClass(cls): - cls.tokenizer: M2M100Tokenizer = M2M100Tokenizer.from_pretrained( - cls.checkpoint_name, src_lang="en", tgt_lang="fr" - ) - cls.pad_token_id = 1 - return cls - - def check_language_codes(self): - self.assertEqual(self.tokenizer.get_lang_id("ar"), 128006) - self.assertEqual(self.tokenizer.get_lang_id("en"), 128022) - self.assertEqual(self.tokenizer.get_lang_id("ro"), 128076) - self.assertEqual(self.tokenizer.get_lang_id("mr"), 128063) - - def test_get_vocab(self): - vocab = self.tokenizer.get_vocab() - self.assertEqual(len(vocab), len(self.tokenizer)) - self.assertEqual(vocab[""], 3) - self.assertIn(self.tokenizer.get_lang_token("en"), vocab) - - def 
test_tokenizer_batch_encode_plus(self): - self.tokenizer.src_lang = "en" - ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] - self.assertListEqual(self.expected_src_tokens, ids) - - def test_tokenizer_decode_ignores_language_codes(self): - self.assertIn(FR_CODE, self.tokenizer.all_special_ids) - generated_ids = [FR_CODE, 5364, 82, 8642, 4, 294, 47, 8, 14028, 136, 3286, 9706, 6, 90797, 6, 144012, 162, 88128, 30061, 5, 2] # fmt: skip - result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - expected_french = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) - self.assertEqual(result, expected_french) - self.assertNotIn(self.tokenizer.eos_token, result) - - def test_special_tokens_unaffacted_by_save_load(self): - with tempfile.TemporaryDirectory() as tmpdirname: - original_special_tokens = self.tokenizer.lang_token_to_id - self.tokenizer.save_pretrained(tmpdirname) - new_tok = M2M100Tokenizer.from_pretrained(tmpdirname) - self.assertDictEqual(new_tok.lang_token_to_id, original_special_tokens) - - @require_mindspore - def test_batch_fairseq_parity(self): - self.tokenizer.src_lang = "en" - self.tokenizer.tgt_lang = "fr" - - batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="ms") - - batch["decoder_input_ids"] = shift_tokens_right( - batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.eos_token_id - ) - - for k in batch: - batch[k] = batch[k].tolist() - # batch = {k: v.tolist() for k,v in batch.items()} - # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 - # batch.decoder_inputs_ids[0][0] == - assert batch.input_ids[1][0] == EN_CODE - assert batch.input_ids[1][-1] == 2 - assert batch.labels[1][0] == FR_CODE - assert batch.labels[1][-1] == 2 - assert batch.decoder_input_ids[1][:2] == [2, FR_CODE] - - @require_mindspore - def test_src_lang_setter(self): - self.tokenizer.src_lang = "mr" - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")]) - self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) - - self.tokenizer.src_lang = "zh" - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")]) - self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) - - @require_mindspore - def test_tokenizer_target_mode(self): - self.tokenizer.tgt_lang = "mr" - self.tokenizer._switch_to_target_mode() - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")]) - self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) - self.tokenizer._switch_to_input_mode() - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)]) - - self.tokenizer.tgt_lang = "zh" - self.tokenizer._switch_to_target_mode() - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")]) - self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) - self.tokenizer._switch_to_input_mode() - self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)]) - - @require_mindspore - def test_tokenizer_translation(self): - inputs = self.tokenizer._build_translation_inputs("A test", return_tensors="ms", src_lang="en", tgt_lang="ar") - - self.assertEqual( - nested_simplify(inputs), - { - # en_XX, A, test, EOS - "input_ids": [[128022, 58, 4183, 2]], - "attention_mask": [[1, 1, 1, 1]], - # ar_AR - "forced_bos_token_id": 
128006, - }, - ) diff --git a/tests/transformers/models/mamba/__init__.py b/tests/transformers/models/mamba/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mamba/test_modeling_graph_mamba.py b/tests/transformers/models/mamba/test_modeling_graph_mamba.py deleted file mode 100644 index 55c2134ae..000000000 --- a/tests/transformers/models/mamba/test_modeling_graph_mamba.py +++ /dev/null @@ -1,124 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import math -import unittest -from typing import Dict, List, Tuple -from unittest.util import safe_repr - -from parameterized import parameterized - -import numpy as np -from mindnlp.transformers import AutoTokenizer, MambaConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - MSMambaForCausalLM as MambaForCausalLM, - MSMambaModel as MambaModel, - ) - - -@require_mindspore -class MambaIntegrationTests(unittest.TestCase): - def setUp(self): - self.model_id = "state-spaces/mamba-2.8b-hf" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) - - @slow - def test_simple_generate(self): - tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf") - tokenizer.pad_token = tokenizer.eos_token - - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf", ms_dtype=mindspore.float16) - model.config.use_cache = True - input_ids = tokenizer("Hey how are you doing?", return_tensors="ms")["input_ids"] - - out = model.generate(input_ids, do_sample=False, max_new_tokens=10) - output_sentence = tokenizer.decode(out[0, :]) - self.assertEqual(output_sentence, "Hey how are you doing?\n\nI'm so glad you're here.") - - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf", ms_dtype=mindspore.float16) - logits = model(input_ids=input_ids).logits - - EXPECTED_LOGITS_NO_GRAD = mindspore.tensor( - [ - -55.6875, -69.8750, -49.9062, -51.7500, -57.6875, -57.9375, -56.9688, - -57.9375, -54.6875, -55.9375, -55.3125, -58.0938, -60.5625, -47.0000, - -52.0312, -49.7812, -55.9375, -57.9062, -56.7812, -57.1250, -57.3438, - -58.3125, -57.8125, -58.7812, -59.6250, -59.0938, -58.7188, -52.9375, - -53.4688, -57.3750, -56.9375, -55.7500, -53.3125, -55.8438, -57.0000, - -56.9062, -56.2188, -54.7188, -56.4375, -57.5000 - ], dtype=mindspore.float32) # fmt: skip - - self.assertTrue(np.allclose(logits[0, 0, :40].asnumpy(), EXPECTED_LOGITS_NO_GRAD.asnumpy(), rtol=1e-2, atol=1e-2)) - - @slow - def test_simple_generate_cuda_kernels_tiny(self): - expected_output = "Hello my name is John and I am a newbie to the world" 
- - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=10) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - @slow - def test_simple_generate_cuda_kernels_small(self): - expected_output = "Hello my name is\n\nI am a\n\nI am a" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-790m-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=10) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - @slow - def test_simple_generate_cuda_kernels_mid(self): - expected_output = "Hello my name is John and I am a\n\nI am a single father of a beautiful daughter. I am a" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-1.4b-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=20) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - @slow - def test_simple_generate_cuda_kernels_big(self): - expected_output = "Hello my name is John and I am a new member of this forum. I am a retired Marine and I am a member of the Marine Corps League. I am a" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-2.8b-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=30) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) diff --git a/tests/transformers/models/mamba/test_modeling_mamba.py b/tests/transformers/models/mamba/test_modeling_mamba.py deleted file mode 100644 index 677779966..000000000 --- a/tests/transformers/models/mamba/test_modeling_mamba.py +++ /dev/null @@ -1,502 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
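
[editor note] The Mamba integration tests deleted above all follow one pattern: tokenize a fixed prompt, run deterministic (greedy) generation for a handful of new tokens, and compare the decoded string against a hard-coded expectation. A minimal sketch of that pattern, reusing only calls that appear in the deleted tests (assumes hub access to the 130m checkpoint named above):

    import mindspore
    from mindnlp.transformers import AutoTokenizer, MambaForCausalLM

    # Smallest public checkpoint referenced by these tests.
    tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
    model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf", ms_dtype=mindspore.float16)

    # Greedy decoding keeps the output deterministic, so an exact string comparison is meaningful.
    input_ids = tokenizer("Hey how are you doing?", return_tensors="ms")["input_ids"]
    out = model.generate(input_ids, do_sample=False, max_new_tokens=10)
    print(tokenizer.decode(out[0, :]))
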
- - -import math -import unittest -from typing import Dict, List, Tuple -from unittest.util import safe_repr - -from parameterized import parameterized - -from mindnlp.transformers import AutoTokenizer, MambaConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad - - from mindnlp.transformers import ( - MambaForCausalLM, - MambaModel, - ) - from mindnlp.transformers.models.mamba.modeling_mamba import MambaCache - - -class MambaModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - intermediate_size=32, - hidden_act="silu", - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - num_labels=3, - num_choices=4, - scope=None, - tie_word_embeddings=True, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - self.tie_word_embeddings = tie_word_embeddings - - def get_large_model_config(self): - return MambaConfig.from_pretrained("hf-internal-testing/mamba-2.8b") - - def prepare_config_and_inputs( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = ids_tensor([self.batch_size, self.seq_length], 1) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config( - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - return ( - config, - input_ids, - attention_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - return MambaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=self.intermediate_size, - activation_function=self.hidden_act, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, 
- pad_token_id=self.pad_token_id, - gradient_checkpointing=gradient_checkpointing, - tie_word_embeddings=self.tie_word_embeddings, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - attention_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - return ( - config, - input_ids, - attention_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def create_and_check_mamba_model(self, config, input_ids, *args): - config.output_hidden_states = True - model = MambaModel(config=config) - model.eval() - - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.hidden_states), config.num_hidden_layers + 1) - - def create_and_check_causal_lm(self, config, input_ids, *args): - model = MambaForCausalLM(config) - model.eval() - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_state_equivalency(self, config, input_ids, *args): - model = MambaModel(config=config) - model.eval() - - outputs = model(input_ids) - output_whole = outputs.last_hidden_state - - outputs = model( - input_ids[:, :-1], - use_cache=True, - cache_position=ops.arange(0, config.conv_kernel), - ) - output_one = outputs.last_hidden_state - - # Using the state computed on the first inputs, we will get the same output - outputs = model( - input_ids[:, -1:], - use_cache=True, - cache_params=outputs.cache_params, - cache_position=ops.arange(config.conv_kernel, config.conv_kernel + 1), - ) - output_two = outputs.last_hidden_state - - self.parent.assertTrue(ops.allclose(ops.cat([output_one, output_two], dim=1), output_whole, atol=1e-3)) - # TODO the orignal mamba does not support decoding more than 1 token neither do we - - def create_and_check_mamba_cached_slow_forward_and_backwards( - self, config, input_ids, *args, gradient_checkpointing=False - ): - model = MambaModel(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - # create cache - cache = model(input_ids, use_cache=True).cache_params - cache.reset() - - # use cache - token_emb = model.embeddings(input_ids) - outputs = model.layers[0].mixer.slow_forward( - token_emb, cache, cache_position=ops.arange(0, config.conv_kernel) - ) - - loss = ops.log(1 + ops.abs(outputs.sum())) - self.parent.assertEqual(loss.shape, ()) - self.parent.assertEqual(outputs.shape, (self.batch_size, self.seq_length, self.hidden_size)) - loss.backward() - - def create_and_check_mamba_lm_head_forward_and_backwards( - self, config, input_ids, *args, gradient_checkpointing=False - ): - model = MambaForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def prepare_config_and_inputs_for_common(self): - ( - config, - input_ids, - attention_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return config, inputs_dict - - 
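
[editor note] create_and_check_state_equivalency above is the core correctness check for Mamba's recurrent path: hidden states from one full forward pass must match those obtained by running the prefix with the cache enabled and then one extra step that reuses cache_params. A condensed sketch of that check, assuming the same tiny config values as the tester (random inputs stand in for the ids_tensor helper):

    import numpy as np
    import mindspore
    from mindnlp.core import ops
    from mindnlp.transformers import MambaConfig, MambaModel

    # Tiny config mirroring MambaModelTester; conv_kernel keeps its default.
    config = MambaConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2,
                         intermediate_size=32, use_cache=True)
    model = MambaModel(config)
    model.eval()

    input_ids = mindspore.Tensor(np.random.randint(0, config.vocab_size, (2, 7)), mindspore.int64)

    # Single pass over the full sequence.
    whole = model(input_ids).last_hidden_state

    # Prefix pass that keeps the recurrent cache, then one extra step reusing it.
    prefix = model(input_ids[:, :-1], use_cache=True,
                   cache_position=ops.arange(0, config.conv_kernel))
    last = model(input_ids[:, -1:], use_cache=True, cache_params=prefix.cache_params,
                 cache_position=ops.arange(config.conv_kernel, config.conv_kernel + 1))

    # Both paths must produce numerically equivalent hidden states.
    assert ops.allclose(ops.cat([prefix.last_hidden_state, last.last_hidden_state], dim=1),
                        whole, atol=1e-3)
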
-@require_mindspore -class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MambaModel, MambaForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (MambaForCausalLM,) if is_mindspore_available() else () - has_attentions = False # Mamba does not support attentions - fx_compatible = False # FIXME let's try to support this @ArthurZucker - test_missing_keys = False - test_model_parallel = False - test_pruning = False - test_head_masking = False # Mamba does not have attention heads - pipeline_model_mapping = ( - {"feature-extraction": MambaModel, "text-generation": MambaForCausalLM} if is_mindspore_available() else {} - ) - - def setUp(self): - self.model_tester = MambaModelTester(self) - self.config_tester = ConfigTester( - self, config_class=MambaConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] - ) - - def assertInterval(self, member, container, msg=None): - r""" - Simple utility function to check if a member is inside an interval. - """ - if isinstance(member, mindspore.Tensor): - max_value, min_value = member.max().item(), member.min().item() - elif isinstance(member, list) or isinstance(member, tuple): - max_value, min_value = max(member), min(member) - - if not isinstance(container, list): - raise TypeError("container should be a list or tuple") - elif len(container) != 2: - raise ValueError("container should have 2 elements") - - expected_min, expected_max = container - - is_inside_interval = (min_value >= expected_min) and (max_value <= expected_max) - - if not is_inside_interval: - standardMsg = "%s not found in %s" % (safe_repr(member), safe_repr(container)) - self.fail(self._formatMessage(msg, standardMsg)) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_mamba_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mamba_model(*config_and_inputs) - - def test_mamba_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_causal_lm(*config_and_inputs) - - def test_state_equivalency(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_state_equivalency(*config_and_inputs) - - def test_mamba_cached_slow_forward_and_backwards(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mamba_cached_slow_forward_and_backwards(*config_and_inputs) - - def test_mamba_lm_head_forward_and_backwards(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mamba_lm_head_forward_and_backwards(*config_and_inputs) - - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, param in model.named_parameters(): - if "dt_proj.bias" in name: - dt = ops.exp( - mindspore.tensor([0, 1]) * (math.log(config.time_step_max) - math.log(config.time_step_min)) - + math.log(config.time_step_min) - ).clamp(min=config.time_step_floor) - inv_dt = dt + ops.log(-ops.expm1(-dt)) - if param.requires_grad: - self.assertTrue(param.data.max().item() <= inv_dt[1]) - self.assertTrue(param.data.min().item() >= inv_dt[0]) - elif "A_log" in name: - A = ops.arange(1, config.state_size + 1, dtype=mindspore.float32)[None, :] - self.assertTrue(ops.allclose(param.data, 
ops.log(A), atol=1e-5, rtol=1e-5)) - elif "D" in name: - if param.requires_grad: - # check if it's a ones like - self.assertTrue(ops.allclose(param.data, ops.ones_like(param.data), atol=1e-5, rtol=1e-5)) - - @slow - def test_model_from_pretrained(self): - model = MambaModel.from_pretrained("hf-internal-testing/mamba-130m") - self.assertIsNotNone(model) - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - with no_grad(): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, MambaCache): # MODIFIED PART START - recursive_check(tuple_object.conv_states, dict_object.conv_states) - recursive_check(tuple_object.ssm_states, dict_object.ssm_states) - elif isinstance(tuple_object, (List, Tuple)): # MODIFIED PART END - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - ops.allclose(tuple_object, dict_object, atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." 
- ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - @unittest.skip("The `input_embeds` when fed don't produce the same results.") - def test_beam_sample_generate(self): - pass - - -@require_mindspore -class MambaIntegrationTests(unittest.TestCase): - def setUp(self): - self.model_id = "state-spaces/mamba-2.8b-hf" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) - - @slow - def test_simple_generate(self): - tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf") - tokenizer.pad_token = tokenizer.eos_token - - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf", ms_dtype=mindspore.float16) - input_ids = tokenizer("Hey how are you doing?", return_tensors="ms")["input_ids"] - - out = model.generate(input_ids, do_sample=False, use_cache=True, max_new_tokens=10) - output_sentence = tokenizer.decode(out[0, :]) - self.assertEqual(output_sentence, "Hey how are you doing?\n\nI'm so glad you're here.") - - with no_grad(): - logits = model(input_ids=input_ids).logits - - EXPECTED_LOGITS_NO_GRAD = mindspore.tensor( - [ - -55.6875, -69.8750, -49.9062, -51.7500, -57.6875, -57.9375, -56.9688, - -57.9375, -54.6875, -55.9375, -55.3125, -58.0938, -60.5625, -47.0000, - -52.0312, -49.7812, -55.9375, -57.9062, -56.7812, -57.1250, -57.3438, - -58.3125, -57.8125, -58.7812, -59.6250, -59.0938, -58.7188, -52.9375, - -53.4688, -57.3750, -56.9375, -55.7500, -53.3125, -55.8438, -57.0000, - -56.9062, -56.2188, -54.7188, -56.4375, -57.5000 - ] - ,dtype=mindspore.float32) # fmt: skip - - assert ops.allclose(logits[0, 0, :40], EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1e-3) - - def test_simple_generate_cuda_kernels_tiny(self): - expected_output = "Hello my name is John and I am a newbie to the world" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=10) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - @slow - def test_simple_generate_cuda_kernels_small(self): - expected_output = "Hello my name is\n\nI am a\n\nI am a" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms") - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-790m-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=10) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - 
self.assertEqual(output_sentence, expected_output) - - @slow - def test_simple_generate_cuda_kernels_mid(self): - expected_output = "Hello my name is John and I am a\n\nI am a single father of a beautiful daughter. I am a" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms") - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-1.4b-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=20) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - @slow - def test_simple_generate_cuda_kernels_big(self): - expected_output = "Hello my name is John and I am a new member of this forum. I am a retired Marine and I am a member of the Marine Corps League. I am a" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = MambaForCausalLM.from_pretrained("state-spaces/mamba-2.8b-hf", ms_dtype=mindspore.float16) - - output = model.generate(input_ids, max_new_tokens=30) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - # @slow - # def test_compile_mamba_cache(self): - # expected_output = "Hello my name is John and I am a\n\nI am a single father of a beautiful daughter. I am a" - - # input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - # model = MambaForCausalLM.from_pretrained("state-spaces/mamba-1.4b-hf", ms_dtype=mindspore.float16) - - # output = model.generate(input_ids, max_new_tokens=20, cache_implementation="mamba") - # output_sentence = self.tokenizer.decode(output[0].tolist()) - # self.assertEqual(output_sentence, expected_output) - - # model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") - # output = model.generate(input_ids, max_new_tokens=20, cache_implementation="mamba") - # output_sentence = self.tokenizer.decode(output[0].tolist()) - # self.assertEqual(output_sentence, expected_output) \ No newline at end of file diff --git a/tests/transformers/models/marian/__init__.py b/tests/transformers/models/marian/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/marian/test_modeling_marian.py b/tests/transformers/models/marian/test_modeling_marian.py deleted file mode 100644 index 99c96fc95..000000000 --- a/tests/transformers/models/marian/test_modeling_marian.py +++ /dev/null @@ -1,847 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
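
[editor note] The Marian tests below, like the M2M100 tests above, derive decoder_input_ids from the labels with shift_tokens_right: the target sequence is shifted one position to the right and the decoder start token is prepended, so the decoder predicts token t from tokens < t. A rough numpy illustration of that transformation (a simplified sketch, not the library's exact implementation; the pad and decoder-start ids below are placeholders):

    import numpy as np

    def shift_tokens_right_sketch(labels, pad_token_id, decoder_start_token_id):
        """Simplified version of the shift used to build decoder_input_ids from labels."""
        shifted = np.zeros_like(labels)
        shifted[:, 1:] = labels[:, :-1]           # shift everything one step to the right
        shifted[:, 0] = decoder_start_token_id    # decoder always starts from its start token
        shifted[shifted == -100] = pad_token_id   # ignored label positions become padding
        return shifted

    # "I am a small frog" + EOS, the target ids used in TestMarian_EN_DE_More below.
    labels = np.array([[38, 121, 14, 697, 38848, 0]])
    print(shift_tokens_right_sketch(labels, pad_token_id=1, decoder_start_token_id=1))
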
-"""Testing suite for the PyTorch Marian model.""" - -import tempfile -import unittest -import numpy as np - -from mindnlp.transformers import MarianConfig -from mindnlp.utils import is_mindspore_available,cached_property -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - AutoConfig, - AutoModelWithLMHead, - AutoTokenizer, - MarianModel, - MarianMTModel, - ) - from mindnlp.transformers.models.marian.modeling_marian import ( - MarianDecoder, - MarianEncoder, - MarianForCausalLM, - shift_tokens_right, - ) - - -def prepare_marian_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones((config.encoder_layers, config.encoder_attention_heads)) - if decoder_head_mask is None: - decoder_head_mask = ops.ones((config.decoder_layers, config.decoder_attention_heads)) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones((config.decoder_layers, config.decoder_attention_heads)) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class MarianModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - decoder_start_token_id=3, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.decoder_start_token_id = decoder_start_token_id - - # forcing a certain token to be generated, sets all other tokens to -inf - # if however the token to be generated is already at -inf then it can lead token - # `nan` values and thus break generation - self.forced_bos_token_id = None - self.forced_eos_token_id = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = 
ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return MarianConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - forced_bos_token_id=self.forced_bos_token_id, - forced_eos_token_id=self.forced_eos_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = MarianModel(config=config).get_decoder() - model.set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([attention_mask.to(next_attn_mask.dtype), next_attn_mask], axis=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_no_past_slice.stop_gradient = True - output_from_past_slice = output_from_past[:, :, random_slice_idx] - output_from_past_slice.stop_gradient = True - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = MarianModel(config=config) - model.set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = MarianEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: 
- decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = MarianDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MarianMTModel,) if is_mindspore_available() else () - all_generative_model_classes = (MarianMTModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": MarianMTModel, - "summarization": MarianMTModel, - "text-generation": MarianForCausalLM, - "text2text-generation": MarianMTModel, - "translation": MarianMTModel, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = True - test_pruning = False - test_missing_keys = False - - def setUp(self): - self.model_tester = MarianModelTester(self) - self.config_tester = ConfigTester(self, config_class=MarianConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - - def test_share_encoder_decoder_embeddings(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - - # check if embeddings are shared by default - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIs(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens) - self.assertIs(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight) - - # check if embeddings are not shared when config.share_encoder_decoder_embeddings = False - config.share_encoder_decoder_embeddings = False - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens) - self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight) - - # check if a model with shared embeddings can be saved and loaded with share_encoder_decoder_embeddings = False - config, _ = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, share_encoder_decoder_embeddings=False) - self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens) - self.assertIsNot(model.get_encoder().embed_tokens.weight, 
model.get_decoder().embed_tokens.weight) - - def test_resize_decoder_token_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs() - - # check if resize_decoder_token_embeddings raises an error when embeddings are shared - for model_class in self.all_model_classes: - model = model_class(config) - with self.assertRaises(ValueError): - model.resize_decoder_token_embeddings(config.vocab_size + 1) - - # check if decoder embeddings are resized when config.share_encoder_decoder_embeddings = False - config.share_encoder_decoder_embeddings = False - for model_class in self.all_model_classes: - model = model_class(config) - model.resize_decoder_token_embeddings(config.vocab_size + 1) - self.assertEqual(model.get_decoder().embed_tokens.weight.shape, (config.vocab_size + 1, config.d_model)) - - # check if lm_head is also resized - config, _ = self.model_tester.prepare_config_and_inputs() - config.share_encoder_decoder_embeddings = False - model = MarianMTModel(config) - model.resize_decoder_token_embeddings(config.vocab_size + 1) - self.assertEqual(model.lm_head.weight.shape, (config.vocab_size + 1, config.d_model)) - - def test_tie_word_embeddings_decoder(self): - pass - - @unittest.skip("Skipping for now, to fix @ArthurZ or @ydshieh") - def test_pipeline_conversational(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." 
- else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class MarianIntegrationTest(unittest.TestCase): - src = "en" - tgt = "de" - src_text = [ - "I am a small frog.", - "Now I can forget the 100 words of german that I know.", - "Tom asked his teacher for advice.", - "That's how I would do it.", - "Tom really admired Mary's courage.", - "Turn around and close your eyes.", - ] - expected_text = [ - "Ich bin ein kleiner Frosch.", - "Jetzt kann ich die 100 Wörter des Deutschen vergessen, die ich kenne.", - "Tom bat seinen Lehrer um Rat.", - "So würde ich das machen.", - "Tom bewunderte Marias Mut wirklich.", - "Drehen Sie sich um und schließen Sie die Augen.", - ] - # ^^ actual C++ output differs slightly: (1) des Deutschen removed, (2) ""-> "O", (3) tun -> machen - - @classmethod - def setUpClass(cls) -> None: - cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}" - return cls - - @cached_property - def tokenizer(self): - return AutoTokenizer.from_pretrained(self.model_name) - - @property - def eos_token_id(self) -> int: - return self.tokenizer.eos_token_id - - @cached_property - def model(self): - model: MarianMTModel = AutoModelWithLMHead.from_pretrained(self.model_name) - c = model.config - self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]]) - self.assertEqual(c.max_length, 512) - self.assertEqual(c.decoder_start_token_id, c.pad_token_id) - return model - - def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs): - generated_words = self.translate_src_text(**tokenizer_kwargs) - self.assertListEqual(self.expected_text, generated_words) - - def translate_src_text(self, **tokenizer_kwargs): - model_inputs = self.tokenizer(self.src_text, padding=True, return_tensors="ms", **tokenizer_kwargs) - generated_ids = self.model.generate( - model_inputs.input_ids, - attention_mask=model_inputs.attention_mask, - num_beams=2, - max_length=128, - renormalize_logits=True, # Marian should always renormalize its logits. 
See #25459 - ) - generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - return generated_words - - -@require_sentencepiece -@require_tokenizers -class TestMarian_EN_DE_More(MarianIntegrationTest): - @slow - def test_forward(self): - src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."] - expected_ids = [38, 121, 14, 697, 38848, 0] - - model_inputs = self.tokenizer(src, text_target=tgt, return_tensors="ms") - - self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist()) - - desired_keys = { - "input_ids", - "attention_mask", - "labels", - } - self.assertSetEqual(desired_keys, set(model_inputs.keys())) - model_inputs["decoder_input_ids"] = shift_tokens_right( - model_inputs.labels, self.tokenizer.pad_token_id, self.model.config.decoder_start_token_id - ) - model_inputs["return_dict"] = True - model_inputs["use_cache"] = False - with mindspore._no_grad(): - outputs = self.model(**model_inputs) - max_indices = outputs.logits.argmax(-1) - self.tokenizer.batch_decode(max_indices) - - def test_unk_support(self): - t = self.tokenizer - ids = t(["||"], return_tensors="ms").input_ids[0].asnumpy().tolist() - expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id] - self.assertEqual(expected, ids) - - def test_pad_not_split(self): - input_ids_w_pad = self.tokenizer(["I am a small frog "], return_tensors="ms").input_ids[0].asnumpy().tolist() - expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad - self.assertListEqual(expected_w_pad, input_ids_w_pad) - - @slow - def test_batch_generation_en_de(self): - self._assert_generated_batch_equal_expected() - - def test_auto_config(self): - config = AutoConfig.from_pretrained(self.model_name) - self.assertIsInstance(config, MarianConfig) - - -@require_sentencepiece -@require_tokenizers -class TestMarian_EN_FR(MarianIntegrationTest): - src = "en" - tgt = "fr" - src_text = [ - "I am a small frog.", - "Now I can forget the 100 words of german that I know.", - ] - expected_text = [ - "Je suis une petite grenouille.", - "Maintenant, je peux oublier les 100 mots d'allemand que je connais.", - ] - - @slow - def test_batch_generation_en_fr(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_FR_EN(MarianIntegrationTest): - src = "fr" - tgt = "en" - src_text = [ - "Donnez moi le micro.", - "Tom et Mary étaient assis à une table.", # Accents - ] - expected_text = [ - "Give me the microphone.", - "Tom and Mary were sitting at a table.", - ] - - @slow - def test_batch_generation_fr_en(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_RU_FR(MarianIntegrationTest): - src = "ru" - tgt = "fr" - src_text = ["Он показал мне рукопись своей новой пьесы."] - expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."] - - @slow - def test_batch_generation_ru_fr(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_MT_EN(MarianIntegrationTest): - """Cover low resource/high perplexity setting. 
This breaks without adjust_logits_generation overwritten""" - - src = "mt" - tgt = "en" - src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."] - expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."] - - @slow - def test_batch_generation_mt_en(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_en_zh(MarianIntegrationTest): - src = "en" - tgt = "zh" - src_text = ["My name is Wolfgang and I live in Berlin"] - expected_text = ["我叫沃尔夫冈 我住在柏林"] - - @slow - def test_batch_generation_eng_zho(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_en_ROMANCE(MarianIntegrationTest): - """Multilingual on target side.""" - - src = "en" - tgt = "ROMANCE" - src_text = [ - ">>fr<< Don't spend so much time watching TV.", - ">>pt<< Your message has been sent.", - ">>es<< He's two years older than me.", - ] - expected_text = [ - "Ne passez pas autant de temps à regarder la télé.", - "A sua mensagem foi enviada.", - "Es dos años más viejo que yo.", - ] - - @slow - def test_batch_generation_en_ROMANCE_multi(self): - self._assert_generated_batch_equal_expected() - - - -@require_sentencepiece -@require_tokenizers -class TestMarian_FI_EN_V2(MarianIntegrationTest): - src = "fi" - tgt = "en" - src_text = [ - "minä tykkään kirjojen lukemisesta", - "Pidän jalkapallon katsomisesta", - ] - expected_text = ["I like to read books", "I like watching football"] - - @classmethod - def setUpClass(cls) -> None: - cls.model_name = "hf-internal-testing/test-opus-tatoeba-fi-en-v2" - return cls - - @slow - def test_batch_generation_fi_en(self): - self._assert_generated_batch_equal_expected() - - -class MarianStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - 
attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = MarianConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = MarianDecoder(config=config) - model.set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_no_past_slice.stop_gradient = True - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - output_from_past_slice.stop_gradient = True - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = MarianDecoder(config=config) - model.set_train(False) - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - axis=1, - ) - - # get two different outputs - output_from_no_past = 
model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_no_past_slice.stop_gradients = True - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - output_from_past_slice.stop_gradients = True - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class MarianStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MarianDecoder, MarianForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (MarianForCausalLM,) if is_mindspore_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = MarianStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=MarianConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return \ No newline at end of file diff --git a/tests/transformers/models/markuplm/__init__.py b/tests/transformers/models/markuplm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/markuplm/test_modeling_markuplm.py b/tests/transformers/models/markuplm/test_modeling_markuplm.py deleted file mode 100644 index 18a0ce3df..000000000 --- a/tests/transformers/models/markuplm/test_modeling_markuplm.py +++ /dev/null @@ -1,378 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Hugging Face Team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
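
[editor note] The standalone decoder tests above verify incremental decoding: a forward pass over the prefix plus the next token without a cache must agree, at the last position, with a single-step pass over the next token that reuses the past_key_values returned by the prefix pass. A minimal sketch of that check, assuming the same small MarianConfig values as the tester and random integer inputs:

    import numpy as np
    import mindspore
    from mindspore import ops
    from mindnlp.transformers import MarianConfig
    from mindnlp.transformers.models.marian.modeling_marian import MarianDecoder

    config = MarianConfig(vocab_size=99, d_model=16, decoder_layers=2, decoder_ffn_dim=32,
                          encoder_attention_heads=4, decoder_attention_heads=4,
                          use_cache=True, pad_token_id=0, is_encoder_decoder=False)
    model = MarianDecoder(config)
    model.set_train(False)

    input_ids = mindspore.Tensor(np.random.randint(3, config.vocab_size, (2, 7)), mindspore.int64)
    next_tokens = mindspore.Tensor(np.random.randint(3, config.vocab_size, (2, 1)), mindspore.int64)

    # Prefix pass that returns the key/value cache.
    past_key_values = model(input_ids, use_cache=True)["past_key_values"]

    # Full pass vs. cached single-step pass; the final position must match.
    full = model(ops.cat([input_ids, next_tokens], axis=-1))["last_hidden_state"]
    step = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
    assert np.allclose(step[:, 0].asnumpy(), full[:, -1].asnumpy(), atol=1e-3)
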
- - -import unittest - -from mindnlp.transformers import MarkupLMConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import nn, ops - - from mindnlp.transformers import ( - MarkupLMForQuestionAnswering, - MarkupLMForSequenceClassification, - MarkupLMForTokenClassification, - MarkupLMModel, - ) - -from mindnlp.transformers import MarkupLMFeatureExtractor, MarkupLMProcessor, MarkupLMTokenizer - - -class MarkupLMModelTester: - """You can also import this e.g from .test_modeling_markuplm import MarkupLMModelTester""" - - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - max_xpath_tag_unit_embeddings=20, - max_xpath_subs_unit_embeddings=30, - tag_pad_id=2, - subs_pad_id=2, - max_depth=10, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - self.max_xpath_tag_unit_embeddings = max_xpath_tag_unit_embeddings - self.max_xpath_subs_unit_embeddings = max_xpath_subs_unit_embeddings - self.tag_pad_id = tag_pad_id - self.subs_pad_id = subs_pad_id - self.max_depth = max_depth - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - xpath_tags_seq = ids_tensor( - [self.batch_size, self.seq_length, self.max_depth], self.max_xpath_tag_unit_embeddings - ) - - xpath_subs_seq = ids_tensor( - [self.batch_size, self.seq_length, self.max_depth], self.max_xpath_subs_unit_embeddings - ) - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return ( - config, - input_ids, - xpath_tags_seq, - xpath_subs_seq, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) - - def 
get_config(self): - return MarkupLMConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - max_xpath_tag_unit_embeddings=self.max_xpath_tag_unit_embeddings, - max_xpath_subs_unit_embeddings=self.max_xpath_subs_unit_embeddings, - tag_pad_id=self.tag_pad_id, - subs_pad_id=self.subs_pad_id, - max_depth=self.max_depth, - ) - - def create_and_check_model( - self, - config, - input_ids, - xpath_tags_seq, - xpath_subs_seq, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - model = MarkupLMModel(config=config) - model.set_train(False) - print("Configs:", model.config.tag_pad_id, model.config.subs_pad_id) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - xpath_tags_seq, - xpath_subs_seq, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - config.num_labels = self.num_labels - model = MarkupLMForSequenceClassification(config) - model.set_train(False) - result = model( - input_ids, - xpath_tags_seq=xpath_tags_seq, - xpath_subs_seq=xpath_subs_seq, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - xpath_tags_seq, - xpath_subs_seq, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - config.num_labels = self.num_labels - model = MarkupLMForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - xpath_tags_seq=xpath_tags_seq, - xpath_subs_seq=xpath_subs_seq, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - xpath_tags_seq, - xpath_subs_seq, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ): - model = MarkupLMForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - xpath_tags_seq=xpath_tags_seq, - xpath_subs_seq=xpath_subs_seq, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - xpath_tags_seq, - xpath_subs_seq, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) = config_and_inputs - 
inputs_dict = { - "input_ids": input_ids, - "xpath_tags_seq": xpath_tags_seq, - "xpath_subs_seq": xpath_subs_seq, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class MarkupLMModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - #MarkupLMModel, - MarkupLMForSequenceClassification, - MarkupLMForTokenClassification, - MarkupLMForQuestionAnswering, - ) - if is_mindspore_available() - else None - ) - pipeline_model_mapping = ( - { - "feature-extraction": MarkupLMModel, - "question-answering": MarkupLMForQuestionAnswering, - "text-classification": MarkupLMForSequenceClassification, - "token-classification": MarkupLMForTokenClassification, - "zero-shot": MarkupLMForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - # ValueError: Nodes must be of type `List[str]` (single pretokenized example), or `List[List[str]]` - # (batch of pretokenized examples). - return True - - def setUp(self): - self.model_tester = MarkupLMModelTester(self) - self.config_tester = ConfigTester(self, config_class=MarkupLMConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - -def prepare_html_string(): - html_string = """ - - - - Page Title - - - -

- <h1>This is a Heading</h1>
- <p>This is a paragraph.</p>
- - - - """ - - return html_string - - -@require_mindspore -class MarkupLMModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processor(self): - # TODO use from_pretrained here - feature_extractor = MarkupLMFeatureExtractor() - tokenizer = MarkupLMTokenizer.from_pretrained("microsoft/markuplm-base", from_pt = True) - - return MarkupLMProcessor(feature_extractor, tokenizer) - - @slow - def test_forward_pass_no_head(self): - model = MarkupLMModel.from_pretrained("microsoft/markuplm-base", from_pt = True) - - processor = self.default_processor - - inputs = processor(prepare_html_string(), return_tensors="ms") - - # forward pass - with mindspore._no_grad(): - outputs = model(**inputs) - - # verify the last hidden states - expected_shape = (1, 14, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - import numpy as np - expected_slice = np.array( - [[0.0675, -0.0052, 0.5001], [-0.2281, 0.0802, 0.2192], [-0.0583, -0.3311, 0.1185]] - ) - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].numpy(), expected_slice, atol=1e-2)) diff --git a/tests/transformers/models/mask2former/__init__.py b/tests/transformers/models/mask2former/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mask2former/test_image_processing_mask2former.py b/tests/transformers/models/mask2former/test_image_processing_mask2former.py deleted file mode 100644 index 47a18a6cc..000000000 --- a/tests/transformers/models/mask2former/test_image_processing_mask2former.py +++ /dev/null @@ -1,496 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore Mask2Former processor. 
""" -# pylint: disable=line-too-long - -import unittest - -import numpy as np -from mindspore import ops -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - - if is_vision_available(): - from mindnlp.transformers import Mask2FormerImageProcessor - from mindnlp.transformers.models.mask2former.image_processing_mask2former import binary_mask_to_rle - from mindnlp.transformers.models.mask2former.modeling_mask2former import Mask2FormerForUniversalSegmentationOutput - -if is_vision_available(): - from PIL import Image - - -class Mask2FormerImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - size=None, - do_resize=True, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - num_labels=10, - do_reduce_labels=True, - ignore_index=255, - ): - super().__init__() - - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = {"shortest_edge": 32, "longest_edge": 1333} if size is None else size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.size_divisor = 0 - # for the post_process_functions - self.batch_size = 2 - self.num_queries = 3 - self.num_classes = 2 - self.height = 3 - self.width = 4 - self.num_labels = num_labels - self.do_reduce_labels = do_reduce_labels - self.ignore_index = ignore_index - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "size_divisor": self.size_divisor, - "num_labels": self.num_labels, - "do_reduce_labels": self.do_reduce_labels, - "ignore_index": self.ignore_index, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to Mask2FormerImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def get_fake_mask2former_outputs(self): - return Mask2FormerForUniversalSegmentationOutput( - # +1 for null class - class_queries_logits=ops.randn((self.batch_size, self.num_queries, self.num_classes + 1)), - masks_queries_logits=ops.randn((self.batch_size, self.num_queries, self.height, self.width)), - ) - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and is_mindspore_available()) else None - - def setUp(self): - self.image_processor_tester = Mask2FormerImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "ignore_index")) - self.assertTrue(hasattr(image_processing, "num_labels")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 32, "longest_edge": 1333}) - self.assertEqual(image_processor.size_divisor, 0) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, size_divisibility=8 - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.size_divisor, 8) - - def comm_get_image_processing_inputs( - self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np" - ): - image_processing = self.image_processing_class(**self.image_processor_dict) - # prepare image and target - num_labels = self.image_processor_tester.num_labels - annotations = None - 
instance_id_to_semantic_id = None - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - if with_segmentation_maps: - high = num_labels - if is_instance_map: - labels_expanded = list(range(num_labels)) * 2 - instance_id_to_semantic_id = dict(enumerate(labels_expanded)) - annotations = [ - np.random.randint(0, high * 2, (img.size[1], img.size[0])).astype(np.uint8) for img in image_inputs - ] - if segmentation_type == "pil": - annotations = [Image.fromarray(annotation) for annotation in annotations] - - inputs = image_processing( - image_inputs, - annotations, - return_tensors="ms", - instance_id_to_semantic_id=instance_id_to_semantic_id, - pad_and_return_pixel_mask=True, - ) - - return inputs - - def test_with_size_divisor(self): - size_divisors = [8, 16, 32] - weird_input_sizes = [(407, 802), (582, 1094)] - for size_divisor in size_divisors: - image_processor_dict = {**self.image_processor_dict, **{"size_divisor": size_divisor}} - image_processing = self.image_processing_class(**image_processor_dict) - for weird_input_size in weird_input_sizes: - inputs = image_processing([np.ones((3, *weird_input_size))], return_tensors="ms") - pixel_values = inputs["pixel_values"] - # check if divisible - self.assertTrue((pixel_values.shape[-1] % size_divisor) == 0) - self.assertTrue((pixel_values.shape[-2] % size_divisor) == 0) - - def test_call_with_segmentation_maps(self): - def common(is_instance_map=False, segmentation_type=None): - inputs = self.comm_get_image_processing_inputs( - with_segmentation_maps=True, is_instance_map=is_instance_map, segmentation_type=segmentation_type - ) - - mask_labels = inputs["mask_labels"] - class_labels = inputs["class_labels"] - pixel_values = inputs["pixel_values"] - - # check the batch_size - for mask_label, class_label in zip(mask_labels, class_labels): - self.assertEqual(mask_label.shape[0], class_label.shape[0]) - # this ensure padding has happened - self.assertEqual(mask_label.shape[1:], pixel_values.shape[2:]) - - common() - common(is_instance_map=True) - common(is_instance_map=False, segmentation_type="pil") - common(is_instance_map=True, segmentation_type="pil") - - def test_integration_instance_segmentation(self): - # load 2 images and corresponding annotations from the hub - repo_id = "nielsr/image-segmentation-toy-data" - image1 = Image.open( - hf_hub_download(repo_id=repo_id, filename="instance_segmentation_image_1.png", repo_type="dataset") - ) - image2 = Image.open( - hf_hub_download(repo_id=repo_id, filename="instance_segmentation_image_2.png", repo_type="dataset") - ) - annotation1 = Image.open( - hf_hub_download(repo_id=repo_id, filename="instance_segmentation_annotation_1.png", repo_type="dataset") - ) - annotation2 = Image.open( - hf_hub_download(repo_id=repo_id, filename="instance_segmentation_annotation_2.png", repo_type="dataset") - ) - - # get instance segmentations and instance-to-segmentation mappings - def get_instance_segmentation_and_mapping(annotation): - instance_seg = np.array(annotation)[:, :, 1] - class_id_map = np.array(annotation)[:, :, 0] - class_labels = np.unique(class_id_map) - - # create mapping between instance IDs and semantic category IDs - inst2class = {} - for label in class_labels: - instance_ids = np.unique(instance_seg[class_id_map == label]) - inst2class.update({i: label for i in instance_ids}) - - return instance_seg, inst2class - - instance_seg1, inst2class1 = get_instance_segmentation_and_mapping(annotation1) - instance_seg2, inst2class2 = 
get_instance_segmentation_and_mapping(annotation2) - - # create a image processor - image_processing = Mask2FormerImageProcessor(reduce_labels=True, ignore_index=255, size=(512, 512)) - - # prepare the images and annotations - inputs = image_processing( - [image1, image2], - [instance_seg1, instance_seg2], - instance_id_to_semantic_id=[inst2class1, inst2class2], - return_tensors="ms", - ) - - # verify the pixel values and pixel mask - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 512, 512)) - self.assertEqual(inputs["pixel_mask"].shape, (2, 512, 512)) - - # verify the class labels - self.assertEqual(len(inputs["class_labels"]), 2) - self.assertTrue(np.allclose(inputs["class_labels"][0].numpy(), mindspore.tensor([30, 55]).numpy())) - self.assertTrue(np.allclose(inputs["class_labels"][1].numpy(), mindspore.tensor([4, 4, 23, 55]).numpy())) - - # verify the mask labels - self.assertEqual(len(inputs["mask_labels"]), 2) - self.assertEqual(inputs["mask_labels"][0].shape, (2, 512, 512)) - self.assertEqual(inputs["mask_labels"][1].shape, (4, 512, 512)) - self.assertEqual(inputs["mask_labels"][0].sum().item(), 41527.0) - self.assertEqual(inputs["mask_labels"][1].sum().item(), 26259.0) - - def test_integration_semantic_segmentation(self): - # load 2 images and corresponding semantic annotations from the hub - repo_id = "nielsr/image-segmentation-toy-data" - image1 = Image.open( - hf_hub_download(repo_id=repo_id, filename="semantic_segmentation_image_1.png", repo_type="dataset") - ) - image2 = Image.open( - hf_hub_download(repo_id=repo_id, filename="semantic_segmentation_image_2.png", repo_type="dataset") - ) - annotation1 = Image.open( - hf_hub_download(repo_id=repo_id, filename="semantic_segmentation_annotation_1.png", repo_type="dataset") - ) - annotation2 = Image.open( - hf_hub_download(repo_id=repo_id, filename="semantic_segmentation_annotation_2.png", repo_type="dataset") - ) - - # create a image processor - image_processing = Mask2FormerImageProcessor(reduce_labels=True, ignore_index=255, size=(512, 512)) - - # prepare the images and annotations - inputs = image_processing( - [image1, image2], - [annotation1, annotation2], - return_tensors="ms", - ) - - # verify the pixel values and pixel mask - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 512, 512)) - self.assertEqual(inputs["pixel_mask"].shape, (2, 512, 512)) - - # verify the class labels - self.assertEqual(len(inputs["class_labels"]), 2) - self.assertTrue(np.allclose(inputs["class_labels"][0].numpy(), mindspore.tensor([2, 4, 60]).numpy())) - self.assertTrue(np.allclose(inputs["class_labels"][1].numpy(), mindspore.tensor([0, 3, 7, 8, 15, 28, 30, 143]).numpy())) - - # verify the mask labels - self.assertEqual(len(inputs["mask_labels"]), 2) - self.assertEqual(inputs["mask_labels"][0].shape, (3, 512, 512)) - self.assertEqual(inputs["mask_labels"][1].shape, (8, 512, 512)) - self.assertEqual(inputs["mask_labels"][0].sum().item(), 170200.0) - self.assertEqual(inputs["mask_labels"][1].sum().item(), 257036.0) - - def test_integration_panoptic_segmentation(self): - # load 2 images and corresponding panoptic annotations from the hub - dataset = load_dataset("nielsr/ade20k-panoptic-demo") - image1 = dataset["train"][0]["image"] - image2 = dataset["train"][1]["image"] - segments_info1 = dataset["train"][0]["segments_info"] - segments_info2 = dataset["train"][1]["segments_info"] - annotation1 = dataset["train"][0]["label"] - annotation2 = dataset["train"][1]["label"] - - def rgb_to_id(color): - if isinstance(color, np.ndarray) and 
len(color.shape) == 3: - if color.dtype == np.uint8: - color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] - return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - - def create_panoptic_map(annotation, segments_info): - annotation = np.array(annotation) - # convert RGB to segment IDs per pixel - # 0 is the "ignore" label, for which we don't need to make binary masks - panoptic_map = rgb_to_id(annotation) - - # create mapping between segment IDs and semantic classes - inst2class = {segment["id"]: segment["category_id"] for segment in segments_info} - - return panoptic_map, inst2class - - panoptic_map1, inst2class1 = create_panoptic_map(annotation1, segments_info1) - panoptic_map2, inst2class2 = create_panoptic_map(annotation2, segments_info2) - - # create a image processor - image_processing = Mask2FormerImageProcessor(ignore_index=0, do_resize=False) - - # prepare the images and annotations - pixel_values_list = [np.moveaxis(np.array(image1), -1, 0), np.moveaxis(np.array(image2), -1, 0)] - inputs = image_processing.encode_inputs( - pixel_values_list, - [panoptic_map1, panoptic_map2], - instance_id_to_semantic_id=[inst2class1, inst2class2], - return_tensors="ms", - ) - - # verify the pixel values and pixel mask - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 512, 711)) - self.assertEqual(inputs["pixel_mask"].shape, (2, 512, 711)) - - # verify the class labels - self.assertEqual(len(inputs["class_labels"]), 2) - expected_class_labels = mindspore.tensor([4, 17, 32, 42, 42, 42, 42, 42, 42, 42, 32, 12, 12, 12, 12, 12, 42, 42, 12, 12, 12, 42, 12, 12, 12, 12, 12, 3, 12, 12, 12, 12, 42, 42, 42, 12, 42, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 5, 12, 12, 12, 12, 12, 12, 12, 0, 43, 43, 43, 96, 43, 104, 43, 31, 125, 31, 125, 138, 87, 125, 149, 138, 125, 87, 87]) # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][0].numpy(), mindspore.tensor(expected_class_labels).numpy())) - expected_class_labels = mindspore.tensor([19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 67, 82, 19, 19, 17, 19, 19, 19, 19, 19, 19, 19, 19, 19, 12, 12, 42, 12, 12, 12, 12, 3, 14, 12, 12, 12, 12, 12, 12, 12, 12, 14, 5, 12, 12, 0, 115, 43, 43, 115, 43, 43, 43, 8, 8, 8, 138, 138, 125, 143]) # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][1].numpy(), expected_class_labels.numpy())) - - # verify the mask labels - self.assertEqual(len(inputs["mask_labels"]), 2) - self.assertEqual(inputs["mask_labels"][0].shape, (79, 512, 711)) - self.assertEqual(inputs["mask_labels"][1].shape, (61, 512, 711)) - self.assertEqual(inputs["mask_labels"][0].sum().item(), 315193.0) - self.assertEqual(inputs["mask_labels"][1].sum().item(), 350747.0) - - def test_binary_mask_to_rle(self): - fake_binary_mask = np.zeros((20, 50)) - fake_binary_mask[0, 20:] = 1 - fake_binary_mask[1, :15] = 1 - fake_binary_mask[5, :10] = 1 - - rle = binary_mask_to_rle(fake_binary_mask) - self.assertEqual(len(rle), 4) - self.assertEqual(rle[0], 21) - self.assertEqual(rle[1], 45) - - def test_post_process_semantic_segmentation(self): - fature_extractor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes) - outputs = self.image_processor_tester.get_fake_mask2former_outputs() - - segmentation = fature_extractor.post_process_semantic_segmentation(outputs) - - self.assertEqual(len(segmentation), self.image_processor_tester.batch_size) - self.assertEqual(segmentation[0].shape, (384, 384)) - - target_sizes = [(1, 4) for i in 
range(self.image_processor_tester.batch_size)] - segmentation = fature_extractor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes) - - self.assertEqual(segmentation[0].shape, target_sizes[0]) - - def test_post_process_instance_segmentation(self): - image_processor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes) - outputs = self.image_processor_tester.get_fake_mask2former_outputs() - segmentation = image_processor.post_process_instance_segmentation(outputs, threshold=0) - - self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual(el["segmentation"].shape, (384, 384)) - - segmentation = image_processor.post_process_instance_segmentation( - outputs, threshold=0, return_binary_maps=True - ) - - self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual(len(el["segmentation"].shape), 3) - self.assertEqual(el["segmentation"].shape[1:], (384, 384)) - - def test_post_process_panoptic_segmentation(self): - image_processing = self.image_processing_class(num_labels=self.image_processor_tester.num_classes) - outputs = self.image_processor_tester.get_fake_mask2former_outputs() - segmentation = image_processing.post_process_panoptic_segmentation(outputs, threshold=0) - - self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual(el["segmentation"].shape, (384, 384)) - - def test_post_process_label_fusing(self): - image_processor = self.image_processing_class(num_labels=self.image_processor_tester.num_classes) - outputs = self.image_processor_tester.get_fake_mask2former_outputs() - - segmentation = image_processor.post_process_panoptic_segmentation( - outputs, threshold=0, mask_threshold=0, overlap_mask_area_threshold=0 - ) - unfused_segments = [el["segments_info"] for el in segmentation] - - fused_segmentation = image_processor.post_process_panoptic_segmentation( - outputs, threshold=0, mask_threshold=0, overlap_mask_area_threshold=0, label_ids_to_fuse={1} - ) - fused_segments = [el["segments_info"] for el in fused_segmentation] - - for el_unfused, el_fused in zip(unfused_segments, fused_segments): - if len(el_unfused) == 0: - self.assertEqual(len(el_unfused), len(el_fused)) - continue - - # Get number of segments to be fused - fuse_targets = [1 for el in el_unfused if el["label_id"] in {1}] - num_to_fuse = 0 if len(fuse_targets) == 0 else sum(fuse_targets) - 1 - # Expected number of segments after fusing - expected_num_segments = max(el["id"] for el in el_unfused) - num_to_fuse - num_segments_fused = max(el["id"] for el in el_fused) - self.assertEqual(num_segments_fused, expected_num_segments) diff --git a/tests/transformers/models/mask2former/test_modeling_mask2former.py b/tests/transformers/models/mask2former/test_modeling_mask2former.py deleted file mode 100644 index c42d20026..000000000 --- a/tests/transformers/models/mask2former/test_modeling_mask2former.py +++ /dev/null @@ -1,441 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore Mask2Former model. """ -# pylint: disable=line-too-long -# pylint: disable=not-callable - -import unittest - -import numpy as np - -from mindnlp.transformers import Mask2FormerConfig -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -#from ....test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import Mask2FormerForUniversalSegmentation, Mask2FormerModel - - if is_vision_available(): - from mindnlp.transformers import Mask2FormerImageProcessor - -if is_vision_available(): - from PIL import Image - - -class Mask2FormerModelTester: - def __init__( - self, - parent, - batch_size=2, - is_training=True, - use_auxiliary_loss=False, - num_queries=10, - num_channels=3, - min_size=32 * 8, - max_size=32 * 8, - num_labels=4, - hidden_dim=64, - num_attention_heads=4, - num_hidden_layers=2, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_auxiliary_loss = use_auxiliary_loss - self.num_queries = num_queries - self.num_channels = num_channels - self.min_size = min_size - self.max_size = max_size - self.num_labels = num_labels - self.hidden_dim = hidden_dim - self.mask_feature_size = hidden_dim - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) - - pixel_mask = ops.ones([self.batch_size, self.min_size, self.max_size]) - - mask_labels = ( - ops.rand(self.batch_size, self.num_labels, self.min_size, self.max_size) > 0.5 - ).float() - class_labels = (ops.rand(self.batch_size, self.num_labels) > 0.5).long() - - config = self.get_config() - return config, pixel_values, pixel_mask, mask_labels, class_labels - - def get_config(self): - config = Mask2FormerConfig( - hidden_size=self.hidden_dim, - num_attention_heads=self.num_attention_heads, - num_hidden_layers=self.num_hidden_layers, - encoder_feedforward_dim=16, - dim_feedforward=32, - num_queries=self.num_queries, - num_labels=self.num_labels, - decoder_layers=2, - encoder_layers=2, - feature_size=16, - ) - config.num_queries = self.num_queries - config.num_labels = self.num_labels - - config.backbone_config.embed_dim = 16 - config.backbone_config.depths = [1, 1, 1, 1] - config.backbone_config.hidden_size = 16 - config.backbone_config.num_channels = self.num_channels - config.backbone_config.num_heads = [1, 1, 2, 2] - config.backbone = None - - config.hidden_dim = self.hidden_dim - config.mask_feature_size = self.hidden_dim - 
config.feature_size = self.hidden_dim - return config - - def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, _, _ = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} - return config, inputs_dict - - def check_output_hidden_state(self, output, config): - encoder_hidden_states = output.encoder_hidden_states - pixel_decoder_hidden_states = output.pixel_decoder_hidden_states - transformer_decoder_hidden_states = output.transformer_decoder_hidden_states - - self.parent.assertTrue(len(encoder_hidden_states), len(config.backbone_config.depths)) - self.parent.assertTrue(len(pixel_decoder_hidden_states), len(config.backbone_config.depths)) - self.parent.assertTrue(len(transformer_decoder_hidden_states), config.decoder_layers) - - def create_and_check_mask2former_model(self, config, pixel_values, pixel_mask, output_hidden_states=False): - model = Mask2FormerModel(config=config) - model.set_train(False) - - output = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - output = model(pixel_values, output_hidden_states=True) - - self.parent.assertEqual( - output.transformer_decoder_last_hidden_state.shape, - (self.batch_size, self.num_queries, self.hidden_dim), - ) - # let's ensure the other two hidden state exists - self.parent.assertTrue(output.pixel_decoder_last_hidden_state is not None) - self.parent.assertTrue(output.encoder_last_hidden_state is not None) - - if output_hidden_states: - self.check_output_hidden_state(output, config) - - def create_and_check_mask2former_instance_segmentation_head_model( - self, config, pixel_values, pixel_mask, mask_labels, class_labels - ): - model = Mask2FormerForUniversalSegmentation(config=config) - model.set_train(False) - - def comm_check_on_output(result): - # let's still check that all the required stuff is there - self.parent.assertTrue(result.transformer_decoder_last_hidden_state is not None) - self.parent.assertTrue(result.pixel_decoder_last_hidden_state is not None) - self.parent.assertTrue(result.encoder_last_hidden_state is not None) - # okay, now we need to check the logits shape - # due to the encoder compression, masks have a //4 spatial size - self.parent.assertEqual( - result.masks_queries_logits.shape, - (self.batch_size, self.num_queries, self.min_size // 4, self.max_size // 4), - ) - # + 1 for null class - self.parent.assertEqual( - result.class_queries_logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1) - ) - - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - comm_check_on_output(result) - - result = model( - pixel_values=pixel_values, pixel_mask=pixel_mask, mask_labels=mask_labels, class_labels=class_labels - ) - - comm_check_on_output(result) - - self.parent.assertTrue(result.loss is not None) - self.parent.assertEqual(result.loss.shape, tuple()) - - -@require_mindspore -class Mask2FormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Mask2FormerModel, Mask2FormerForUniversalSegmentation) if is_mindspore_available() else () - pipeline_model_mapping = {"image-feature-extraction": Mask2FormerModel} if is_mindspore_available() else {} - - is_encoder_decoder = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - - def setUp(self): - self.model_tester = Mask2FormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=Mask2FormerConfig, has_text_modality=False) - - def test_config(self): - 
self.config_tester.run_common_tests() - - def test_mask2former_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_mask2former_model(config, **inputs, output_hidden_states=False) - - def test_mask2former_instance_segmentation_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mask2former_instance_segmentation_head_model(*config_and_inputs) - - @unittest.skip(reason="Mask2Former does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Mask2Former does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Mask2Former is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="Mask2Former does not use token embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip( - reason="Mask2Former has some layers using `add_module` which doesn't work well with `nn.DataParallel`" - ) - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip("ignore due to the difference for default bias initialization between mindspore.nn.Dense and torch.nn.Linear") - def test_initialization(self): - pass - - @slow - def test_model_from_pretrained(self): - for model_name in ["facebook/mask2former-swin-small-coco-instance"]: - model = Mask2FormerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_model_with_labels(self): - size = (self.model_tester.min_size,) * 2 - inputs = { - "pixel_values": ops.randn(2, 3, *size), - "mask_labels": ops.randn(2, 10, *size), - "class_labels": ops.zeros(2, 10).long(), - } - config = self.model_tester.get_config() - - model = Mask2FormerForUniversalSegmentation(config) - outputs = model(**inputs) - self.assertTrue(outputs.loss is not None) - - def test_hidden_states_output(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_mask2former_model(config, **inputs, output_hidden_states=True) - - def test_attention_outputs(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - outputs = model(**inputs, output_attentions=True) - self.assertTrue(outputs.attentions is not None) - - @unittest.skip("skip train") - def test_training(self): - if not self.model_tester.is_training: - return - - model_class = self.all_model_classes[1] - config, pixel_values, pixel_mask, mask_labels, class_labels = self.model_tester.prepare_config_and_inputs() - - model = model_class(config) - model.set_train(True) - - loss = model(pixel_values, mask_labels=mask_labels, class_labels=class_labels).loss - loss.backward() - - @unittest.skip("MindSpore has no retain grad") - def test_retain_grad_hidden_states_attentions(self): - model_class = self.all_model_classes[1] - config, pixel_values, pixel_mask, mask_labels, class_labels = self.model_tester.prepare_config_and_inputs() - config.output_hidden_states = True - config.output_attentions = True - - model = model_class(config) - model.set_train(True) - - outputs = model(pixel_values, mask_labels=mask_labels, class_labels=class_labels) - - encoder_hidden_states = outputs.encoder_hidden_states[0] - encoder_hidden_states.retain_grad() - - pixel_decoder_hidden_states = outputs.pixel_decoder_hidden_states[0] - pixel_decoder_hidden_states.retain_grad() - - 
transformer_decoder_hidden_states = outputs.transformer_decoder_hidden_states[0] - transformer_decoder_hidden_states.retain_grad() - - attentions = outputs.attentions[0] - attentions.retain_grad() - - outputs.loss.backward(retain_graph=True) - - self.assertIsNotNone(encoder_hidden_states.grad) - self.assertIsNotNone(pixel_decoder_hidden_states.grad) - self.assertIsNotNone(transformer_decoder_hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - -TOLERANCE = 1e-4 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_vision -@slow -class Mask2FormerModelIntegrationTest(unittest.TestCase): - @cached_property - def model_checkpoints(self): - return "facebook/mask2former-swin-small-coco-instance" - - @cached_property - def default_image_processor(self): - return Mask2FormerImageProcessor.from_pretrained(self.model_checkpoints) if is_vision_available() else None - - def test_inference_no_head(self): - model = Mask2FormerModel.from_pretrained(self.model_checkpoints) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size is divisible by 32 - self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0) - # check size - self.assertEqual(inputs_shape, (1, 3, 384, 384)) - - outputs = model(**inputs) - - expected_slice_hidden_state = mindspore.tensor( - [[-0.2790, -1.0717, -1.1668], [-0.5128, -0.3128, -0.4987], [-0.5832, 0.1971, -0.0197]] - ) - self.assertTrue( - np.allclose( - outputs.encoder_last_hidden_state[0, 0, :3, :3].numpy(), expected_slice_hidden_state.numpy(), atol=TOLERANCE - ) - ) - - expected_slice_hidden_state = mindspore.tensor( - [[0.8973, 1.1847, 1.1776], [1.1934, 1.5040, 1.5128], [1.1153, 1.4486, 1.4951]] - ) - self.assertTrue( - np.allclose( - outputs.pixel_decoder_last_hidden_state[0, 0, :3, :3].numpy(), expected_slice_hidden_state.numpy(), atol=TOLERANCE - ) - ) - - expected_slice_hidden_state = mindspore.tensor( - [[2.1152, 1.7000, -0.8603], [1.5808, 1.8004, -0.9353], [1.6043, 1.7495, -0.5999]] - ) - self.assertTrue( - np.allclose( - outputs.transformer_decoder_last_hidden_state[0, :3, :3].numpy(), expected_slice_hidden_state.numpy(), atol=TOLERANCE - ) - ) - - def test_inference_universal_segmentation_head(self): - model = Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).set_train(False) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size is divisible by 32 - self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0) - # check size - self.assertEqual(inputs_shape, (1, 3, 384, 384)) - - outputs = model(**inputs) - # masks_queries_logits - masks_queries_logits = outputs.masks_queries_logits - self.assertEqual( - masks_queries_logits.shape, (1, model.config.num_queries, inputs_shape[-2] // 4, inputs_shape[-1] // 4) - ) - expected_slice = [ - [-8.7839, -9.0056, -8.8121], - [-7.4104, -7.0313, -6.5401], - [-6.6105, -6.3427, -6.4675], - ] - expected_slice = mindspore.tensor(expected_slice) - self.assertTrue(np.allclose(masks_queries_logits[0, 0, :3, :3].numpy(), expected_slice.numpy(), atol=TOLERANCE)) - # class_queries_logits - class_queries_logits = outputs.class_queries_logits - 
self.assertEqual(class_queries_logits.shape, (1, model.config.num_queries, model.config.num_labels + 1)) - expected_slice = mindspore.tensor( - [ - [1.8324, -8.0835, -4.1922], - [0.8450, -9.0050, -3.6053], - [0.3045, -7.7293, -3.0275], - ] - ) - self.assertTrue(np.allclose(outputs.class_queries_logits[0, :3, :3].numpy(), expected_slice.numpy(), atol=TOLERANCE)) - - def test_inference_fp16(self): - model = ( - Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints) - .set_train(False) - ) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms") #.to(dtype=mindspore.float16) - - _ = model(**inputs) - - def test_with_segmentation_maps_and_loss(self): - model = Mask2FormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).set_train(False) - image_processor = self.default_image_processor - - inputs = image_processor( - [np.zeros((3, 800, 1333)), np.zeros((3, 800, 1333))], - segmentation_maps=[np.zeros((384, 384)).astype(np.float32), np.zeros((384, 384)).astype(np.float32)], - return_tensors="ms", - ) - - inputs["pixel_values"] = inputs["pixel_values"] - inputs["mask_labels"] = inputs["mask_labels"] - inputs["class_labels"] = inputs["class_labels"] - - outputs = model(**inputs) - - self.assertTrue(outputs.loss is not None) diff --git a/tests/transformers/models/maskformer/__init__.py b/tests/transformers/models/maskformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/maskformer/test_modeling_maskformer.py b/tests/transformers/models/maskformer/test_modeling_maskformer.py deleted file mode 100644 index f0530c492..000000000 --- a/tests/transformers/models/maskformer/test_modeling_maskformer.py +++ /dev/null @@ -1,605 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore MaskFormer model.""" - -import copy -import unittest - -import numpy as np - -from ...test_modeling_common import floats_tensor -from mindnlp.transformers import DetrConfig, MaskFormerConfig, SwinConfig -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - from mindnlp.core.nn import functional as F - - from mindnlp.transformers import MaskFormerForInstanceSegmentation, MaskFormerModel - - if is_vision_available(): - from mindnlp.transformers import MaskFormerImageProcessor - -if is_vision_available(): - from PIL import Image - - -class MaskFormerModelTester: - def __init__( - self, - parent, - batch_size=2, - is_training=True, - use_auxiliary_loss=False, - num_queries=10, - num_channels=3, - min_size=32 * 4, - max_size=32 * 6, - num_labels=4, - mask_feature_size=32, - num_hidden_layers=2, - num_attention_heads=2, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_auxiliary_loss = use_auxiliary_loss - self.num_queries = num_queries - self.num_channels = num_channels - self.min_size = min_size - self.max_size = max_size - self.num_labels = num_labels - self.mask_feature_size = mask_feature_size - # This is passed to the decoder config. We add it to the model tester here for testing - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) - - pixel_mask = ops.ones([self.batch_size, self.min_size, self.max_size]) - - mask_labels = ( - ops.rand([self.batch_size, self.num_labels, self.min_size, self.max_size]) > 0.5 - ).float() - class_labels = (ops.rand((self.batch_size, self.num_labels)) > 0.5).long() - - config = self.get_config() - return config, pixel_values, pixel_mask, mask_labels, class_labels - - def get_config(self): - return MaskFormerConfig.from_backbone_and_decoder_configs( - backbone_config=SwinConfig( - depths=[1, 1, 1, 1], - embed_dim=16, - hidden_size=32, - num_heads=[1, 1, 2, 2], - ), - backbone=None, - decoder_config=DetrConfig( - decoder_ffn_dim=64, - decoder_layers=self.num_hidden_layers, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=64, - encoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - num_queries=self.num_queries, - d_model=self.mask_feature_size, - ), - mask_feature_size=self.mask_feature_size, - fpn_feature_size=self.mask_feature_size, - num_channels=self.num_channels, - num_labels=self.num_labels, - ) - - def prepare_config_and_inputs_for_common(self): - config, pixel_values, pixel_mask, _, _ = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask} - return config, inputs_dict - - def check_output_hidden_state(self, output, config): - encoder_hidden_states = output.encoder_hidden_states - pixel_decoder_hidden_states = output.pixel_decoder_hidden_states - transformer_decoder_hidden_states = output.transformer_decoder_hidden_states - - 
self.parent.assertTrue(len(encoder_hidden_states), len(config.backbone_config.depths)) - self.parent.assertTrue(len(pixel_decoder_hidden_states), len(config.backbone_config.depths)) - self.parent.assertTrue(len(transformer_decoder_hidden_states), config.decoder_config.decoder_layers) - - def create_and_check_maskformer_model(self, config, pixel_values, pixel_mask, output_hidden_states=False): - with no_grad(): - model = MaskFormerModel(config=config) - model.eval() - - output = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - output = model(pixel_values, output_hidden_states=True) - # the correct shape of output.transformer_decoder_hidden_states ensure the correcteness of the - # encoder and pixel decoder - self.parent.assertEqual( - output.transformer_decoder_last_hidden_state.shape, - (self.batch_size, self.num_queries, self.mask_feature_size), - ) - # let's ensure the other two hidden state exists - self.parent.assertTrue(output.pixel_decoder_last_hidden_state is not None) - self.parent.assertTrue(output.encoder_last_hidden_state is not None) - - if output_hidden_states: - self.check_output_hidden_state(output, config) - - def create_and_check_maskformer_instance_segmentation_head_model( - self, config, pixel_values, pixel_mask, mask_labels, class_labels - ): - model = MaskFormerForInstanceSegmentation(config=config) - model.eval() - - def comm_check_on_output(result): - # let's still check that all the required stuff is there - self.parent.assertTrue(result.transformer_decoder_last_hidden_state is not None) - self.parent.assertTrue(result.pixel_decoder_last_hidden_state is not None) - self.parent.assertTrue(result.encoder_last_hidden_state is not None) - # okay, now we need to check the logits shape - # due to the encoder compression, masks have a //4 spatial size - self.parent.assertEqual( - result.masks_queries_logits.shape, - (self.batch_size, self.num_queries, self.min_size // 4, self.max_size // 4), - ) - # + 1 for null class - self.parent.assertEqual( - result.class_queries_logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1) - ) - - with no_grad(): - result = model(pixel_values=pixel_values, pixel_mask=pixel_mask) - result = model(pixel_values) - - comm_check_on_output(result) - - result = model( - pixel_values=pixel_values, pixel_mask=pixel_mask, mask_labels=mask_labels, class_labels=class_labels - ) - - comm_check_on_output(result) - - self.parent.assertTrue(result.loss is not None) - self.parent.assertEqual(result.loss.shape, ()) - - -@require_mindspore -class MaskFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (MaskFormerModel, MaskFormerForInstanceSegmentation) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": MaskFormerModel, "image-segmentation": MaskFormerForInstanceSegmentation} - if is_mindspore_available() - else {} - ) - - is_encoder_decoder = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - zero_init_hidden_state = True - - def setUp(self): - self.model_tester = MaskFormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=MaskFormerConfig, has_text_modality=False) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if return_labels: - if model_class in [MaskFormerForInstanceSegmentation]: - inputs_dict["mask_labels"] = ops.zeros( - ( - self.model_tester.batch_size, - self.model_tester.num_labels, - self.model_tester.min_size, - 
self.model_tester.max_size, - ), - dtype=mindspore.float32, - ) - inputs_dict["class_labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_labels), dtype=mindspore.int64 - ) - - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - def test_maskformer_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_maskformer_model(config, **inputs, output_hidden_states=False) - - def test_maskformer_instance_segmentation_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_maskformer_instance_segmentation_head_model(*config_and_inputs) - - @unittest.skip(reason="MaskFormer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="MaskFormer does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="MaskFormer is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="MaskFormer does not use token embeddings") - def test_resize_tokens_embeddings(self): - pass - - @slow - def test_model_from_pretrained(self): - for model_name in ["facebook/maskformer-swin-small-coco"]: - model = MaskFormerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_model_with_labels(self): - size = (self.model_tester.min_size,) * 2 - inputs = { - "pixel_values": ops.randn((2, 3, *size)), - "mask_labels": ops.randn((2, 10, *size)), - "class_labels": ops.zeros(2, 10).long(), - } - - model = MaskFormerForInstanceSegmentation(MaskFormerConfig()) - outputs = model(**inputs) - self.assertTrue(outputs.loss is not None) - - def test_hidden_states_output(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_maskformer_model(config, **inputs, output_hidden_states=True) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # Check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - # encoder_hidden_states, pixel_decoder_hidden_states, transformer_decoder_hidden_states, hidden_states - added_hidden_states = 4 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - - def 
test_forward_auxiliary_loss(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_auxiliary_loss = True - config.output_auxiliary_logits = True - config.output_hidden_states = True - - # only test for object detection and segmentation model - for model_class in self.all_model_classes[1:]: - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - outputs = model(**inputs) - - self.assertIsNotNone(outputs.auxiliary_logits) - self.assertEqual(len(outputs.auxiliary_logits), self.model_tester.num_channels - 1) - - def test_batching_equivalence(self): - def equivalence(tensor1, tensor2): - return 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=0).max() - - def recursive_check(batched_object, single_row_object, model_name, key): - if isinstance(batched_object, (list, tuple)): - for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - elif batched_object is None: - return - else: - batched_row = batched_object[:1] - self.assertFalse( - ops.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" - ) - self.assertTrue( - (equivalence(batched_row, single_row_object)) <= 1e-03, - msg=( - f"Batched and Single row outputs are not equal in {model_name} for key={key}. " - f"Difference={equivalence(batched_row, single_row_object)}." 
- ), - ) - - config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - config.output_hidden_states = True - - model_name = model_class.__name__ - batched_input_prepared = self._prepare_for_class(batched_input, model_class) - model = model_class(config).eval() - batch_size = self.model_tester.batch_size - - single_row_input = {} - for key, value in batched_input_prepared.items(): - single_batch_shape = value.shape[0] // batch_size - single_row_input[key] = value[:single_batch_shape] - - with no_grad(): - model_batched_output = model(**batched_input_prepared) - model_row_output = model(**single_row_input) - - for key in model_batched_output: - # remove the first zero-init queries to decoder, otherwise cos_similarity = `nan` - # no need to check all hidden_states, already checked separately each one - if key == "transformer_decoder_hidden_states": - model_batched_output[key] = model_batched_output[key][1:] - model_row_output[key] = model_row_output[key][1:] - elif key == "hidden_states": - continue - recursive_check(model_batched_output[key], model_row_output[key], model_name, key) - - def test_backbone_selection(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - config.backbone_config = None - config.backbone_kwargs = {"out_indices": [1, 2, 3]} - config.use_pretrained_backbone = True - - # Load a timm backbone - # We can't load transformer checkpoint with timm backbone, as we can't specify features_only and out_indices - config.backbone = "resnet18" - config.use_timm_backbone = True - - for model_class in self.all_model_classes: - model = model_class(config).eval() - if model.__class__.__name__ == "MaskFormerModel": - self.assertEqual(model.pixel_level_module.encoder.out_indices, [1, 2, 3]) - elif model.__class__.__name__ == "MaskFormerForUniversalSegmentation": - self.assertEqual(model.model.pixel_level_module.encoder.out_indices, [1, 2, 3]) - - # Load a HF backbone - config.backbone = "microsoft/resnet-18" - config.use_timm_backbone = False - - for model_class in self.all_model_classes: - model = model_class(config).eval() - if model.__class__.__name__ == "MaskFormerModel": - self.assertEqual(model.pixel_level_module.encoder.out_indices, [1, 2, 3]) - elif model.__class__.__name__ == "MaskFormerForUniversalSegmentation": - self.assertEqual(model.model.pixel_level_module.encoder.out_indices, [1, 2, 3]) - - -TOLERANCE = 1e-3 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_vision -@slow -class MaskFormerModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - MaskFormerImageProcessor.from_pretrained("facebook/maskformer-swin-small-coco") - if is_vision_available() - else None - ) - - def test_inference_no_head(self): - model = MaskFormerModel.from_pretrained("facebook/maskformer-swin-small-coco") - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size is divisible by 32 - self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0) - # check size - self.assertEqual(inputs_shape, (1, 3, 800, 1088)) - - with no_grad(): - outputs = model(**inputs) - - expected_slice_hidden_state = mindspore.tensor( - [[-0.0482, 0.9228, 0.4951], [-0.2547, 0.8017, 0.8527], 
[-0.0069, 0.3385, -0.0089]] - ) - print(outputs.encoder_last_hidden_state[0, 0, :3, :3], expected_slice_hidden_state) - self.assertTrue( - ops.allclose( - outputs.encoder_last_hidden_state[0, 0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE - ) - ) - - expected_slice_hidden_state = mindspore.tensor( - [[-0.8422, -0.8434, -0.9718], [-1.0144, -0.5565, -0.4195], [-1.0038, -0.4484, -0.1961]] - ) - self.assertTrue( - ops.allclose( - outputs.pixel_decoder_last_hidden_state[0, 0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE - ) - ) - - expected_slice_hidden_state = mindspore.tensor( - [[0.2852, -0.0159, 0.9735], [0.6254, 0.1858, 0.8529], [-0.0680, -0.4116, 1.8413]] - ) - self.assertTrue( - ops.allclose( - outputs.transformer_decoder_last_hidden_state[0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE - ) - ) - - def test_inference_instance_segmentation_head(self): - model = ( - MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-small-coco") - - .eval() - ) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size is divisible by 32 - self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0) - # check size - self.assertEqual(inputs_shape, (1, 3, 800, 1088)) - - with no_grad(): - outputs = model(**inputs) - # masks_queries_logits - masks_queries_logits = outputs.masks_queries_logits - self.assertEqual( - masks_queries_logits.shape, - (1, model.config.decoder_config.num_queries, inputs_shape[-2] // 4, inputs_shape[-1] // 4), - ) - expected_slice = [ - [-1.3737124, -1.7724937, -1.9364233], - [-1.5977281, -1.9867939, -2.1523695], - [-1.5795398, -1.9269832, -2.093942], - ] - expected_slice = mindspore.tensor(expected_slice) - self.assertTrue(ops.allclose(masks_queries_logits[0, 0, :3, :3], expected_slice, atol=TOLERANCE)) - # class_queries_logits - class_queries_logits = outputs.class_queries_logits - self.assertEqual( - class_queries_logits.shape, (1, model.config.decoder_config.num_queries, model.config.num_labels + 1) - ) - expected_slice = mindspore.tensor( - [ - [1.6512e00, -5.2572e00, -3.3519e00], - [3.6169e-02, -5.9025e00, -2.9313e00], - [1.0766e-04, -7.7630e00, -5.1263e00], - ] - ) - self.assertTrue(ops.allclose(outputs.class_queries_logits[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_instance_segmentation_head_resnet_backbone(self): - model = ( - MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-resnet101-coco-stuff") - - .eval() - ) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size is divisible by 32 - self.assertTrue((inputs_shape[-1] % 32) == 0 and (inputs_shape[-2] % 32) == 0) - # check size - self.assertEqual(inputs_shape, (1, 3, 800, 1088)) - - with no_grad(): - outputs = model(**inputs) - # masks_queries_logits - masks_queries_logits = outputs.masks_queries_logits - self.assertEqual( - masks_queries_logits.shape, - (1, model.config.decoder_config.num_queries, inputs_shape[-2] // 4, inputs_shape[-1] // 4), - ) - expected_slice = [[-0.9046, -2.6366, -4.6062], [-3.4179, -5.7890, -8.8057], [-4.9179, -7.6560, -10.7711]] - expected_slice = mindspore.tensor(expected_slice) - self.assertTrue(ops.allclose(masks_queries_logits[0, 0, :3, :3], expected_slice, atol=TOLERANCE)) - # class_queries_logits - class_queries_logits = 
outputs.class_queries_logits - self.assertEqual( - class_queries_logits.shape, (1, model.config.decoder_config.num_queries, model.config.num_labels + 1) - ) - expected_slice = mindspore.tensor( - [[4.7188, -3.2585, -2.8857], [6.6871, -2.9181, -1.2487], [7.2449, -2.2764, -2.1874]] - ) - self.assertTrue(ops.allclose(outputs.class_queries_logits[0, :3, :3], expected_slice, atol=TOLERANCE)) - - @require_mindspore - def test_inference_fp16(self): - model = ( - MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-resnet101-coco-stuff") - .to(dtype=mindspore.float16) - .eval() - ) - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(image, return_tensors="ms").to(dtype=mindspore.float16) - - with no_grad(): - _ = model(**inputs) - - def test_with_segmentation_maps_and_loss(self): - model = ( - MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-small-coco") - - .eval() - ) - image_processor = self.default_image_processor - - inputs = image_processor( - [np.zeros((3, 400, 333)), np.zeros((3, 400, 333))], - segmentation_maps=[np.zeros((384, 384)).astype(np.float32), np.zeros((384, 384)).astype(np.float32)], - return_tensors="ms", - ) - - inputs["pixel_values"] = inputs["pixel_values"] - inputs["mask_labels"] = [el for el in inputs["mask_labels"]] - inputs["class_labels"] = [el for el in inputs["class_labels"]] - - with no_grad(): - outputs = model(**inputs) - - self.assertTrue(outputs.loss is not None) \ No newline at end of file diff --git a/tests/transformers/models/mbart/__init__.py b/tests/transformers/models/mbart/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mbart/test_modeling_mbart.py b/tests/transformers/models/mbart/test_modeling_mbart.py deleted file mode 100644 index d5c2b4660..000000000 --- a/tests/transformers/models/mbart/test_modeling_mbart.py +++ /dev/null @@ -1,726 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch MBART model. 
""" - - -import copy -import tempfile -import unittest - -import numpy as np -from mindnlp.transformers import MBartConfig -from mindnlp.utils import is_mindspore_available, cached_property -from mindnlp.utils.testing_utils import ( - require_mindspore, - # require_torch_fp16, - slow, -) -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - from mindnlp.transformers import ( - AutoTokenizer, - MBartForCausalLM, - MBartForConditionalGeneration, - MBartForQuestionAnswering, - MBartForSequenceClassification, - MBartModel, - ) - from mindnlp.transformers.tokenization_utils import BatchEncoding - from mindnlp.transformers.models.mbart.modeling_mbart import MBartDecoder, MBartEncoder - -def prepare_mbart_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones((config.encoder_layers, config.encoder_attention_heads), dtype=mindspore.int64) - if decoder_head_mask is None: - decoder_head_mask = ops.ones((config.decoder_layers, config.decoder_attention_heads), dtype=mindspore.int64) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones((config.decoder_layers, config.decoder_attention_heads), dtype=mindspore.int64) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class MBartModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=100, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - # forcing a certain token to be generated, sets all other tokens to -inf - # if however the token to be generated is already at -inf then it can lead token - # `nan` values and thus break generation - self.forced_bos_token_id = None - self.forced_eos_token_id = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - 
input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return MBartConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - forced_bos_token_id=self.forced_bos_token_id, - forced_eos_token_id=self.forced_eos_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = MBartModel(config=config).get_decoder() - model.set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([attention_mask.astype(next_attn_mask.dtype), next_attn_mask], axis=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = MBartModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = MBartEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = 
MBartDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class MBartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (MBartForConditionalGeneration, MBartForSequenceClassification, MBartForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (MBartForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": MBartForConditionalGeneration, - "feature-extraction": MBartModel, - "fill-mask": MBartForConditionalGeneration, - "question-answering": MBartForQuestionAnswering, - "summarization": MBartForConditionalGeneration, - "text-classification": MBartForSequenceClassification, - "text-generation": MBartForCausalLM, - "text2text-generation": MBartForConditionalGeneration, - "translation": MBartForConditionalGeneration, - "zero-shot": MBartForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = False # Fix me Michael - test_pruning = False - test_missing_keys = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - def setUp(self): - self.model_tester = MBartModelTester(self) - self.config_tester = ConfigTester(self, config_class=MBartConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - - # MBartForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (MBartModel, MBartForConditionalGeneration, MBartForQuestionAnswering): - model = model_class(config) - model.set_train(False) - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] 
= wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - model(**inputs)[0] - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = MBartForConditionalGeneration(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - @unittest.skip("test_ensure_weights_are_shared") - def test_ensure_weights_are_shared(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - - config.tie_word_embeddings = True - model = MBartForConditionalGeneration(config) - - # MBart shares four weights. - # Not an issue to not have these correctly tied for torch.load, but it is an issue for safetensors. - self.assertEqual( - len( - { - model.get_output_embeddings().weight.data_ptr(), - model.get_input_embeddings().weight.data_ptr(), - model.base_model.decoder.embed_tokens.weight.data_ptr(), - model.base_model.encoder.embed_tokens.weight.data_ptr(), - } - ), - 1, - ) - - config.tie_word_embeddings = False - model = MBartForConditionalGeneration(config) - - # MBart shares four weights. - # Not an issue to not have these correctly tied for torch.load, but it is an issue for safetensors. - self.assertEqual( - len( - { - model.get_output_embeddings().weight.data_ptr(), - model.get_input_embeddings().weight.data_ptr(), - model.base_model.decoder.embed_tokens.weight.data_ptr(), - model.base_model.encoder.embed_tokens.weight.data_ptr(), - } - ), - 2, - ) - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.size > 100: - msg = f"tensor values are {pct_different:.1%} percent different." 
- else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -@require_mindspore -class AbstractSeq2SeqIntegrationTest(unittest.TestCase): - maxDiff = 1000 # longer string compare tracebacks - checkpoint_name = None - - @classmethod - def setUpClass(cls): - cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) - return cls - - @cached_property - def model(self): - """Only load the model if needed.""" - model = MBartForConditionalGeneration.from_pretrained(self.checkpoint_name) - model = model.half() - return model - - -@require_mindspore -class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "facebook/mbart-large-en-ro" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" - ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' - " face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.", - ] - expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, 250004] - - @slow - def test_enro_generate_one(self): - batch: BatchEncoding = self.tokenizer( - ["UN Chief Says There Is No Military Solution in Syria"], return_tensors="ms" - ) - translated_tokens = self.model.generate(**batch) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - self.assertEqual(self.tgt_text[0], decoded[0]) - # self.assertEqual(self.tgt_text[1], decoded[1]) - - @slow - def test_enro_generate_batch(self): - batch: BatchEncoding = self.tokenizer(self.src_text, return_tensors="ms", padding=True, truncation=True) - translated_tokens = self.model.generate(**batch) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded - - def test_mbart_enro_config(self): - mbart_models = ["facebook/mbart-large-en-ro"] - expected = {"scale_embedding": True, "output_past": True} - for name in mbart_models: - config = MBartConfig.from_pretrained(name) - for k, v in expected.items(): - try: - self.assertEqual(v, getattr(config, k)) - except AssertionError as e: - e.args += (name, k) - raise - - def test_mbart_fast_forward(self): - config = MBartConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, - ) - lm_model = MBartForConditionalGeneration(config) - context = mindspore.tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]) - summary = mindspore.tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).to(mindspore.int32) - result = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) - expected_shape = (*summary.shape, config.vocab_size) - self.assertEqual(result.logits.shape, expected_shape) - - -@require_mindspore -class 
MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "facebook/mbart-large-cc25" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - " I ate lunch twice yesterday", - ] - tgt_text = ["Şeful ONU declară că nu există o soluţie militară în Siria", "to be padded"] - - @unittest.skip("This test is broken, still generates english") - def test_cc25_generate(self): - inputs = self.tokenizer([self.src_text[0]], return_tensors="ms") - translated_tokens = self.model.generate( - input_ids=inputs["input_ids"], - decoder_start_token_id=self.tokenizer.lang_code_to_id["ro_RO"], - ) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - self.assertEqual(self.tgt_text[0], decoded[0]) - - @slow - def test_fill_mask(self): - inputs = self.tokenizer(["One of the best I ever read!"], return_tensors="ms") - outputs = self.model.generate( - inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1 - ) - prediction: str = self.tokenizer.batch_decode( - outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True - )[0] - self.assertEqual(prediction, "of the best books I ever read!") - - -class MBartStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = MBartConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - 
use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = MBartDecoder(config=config).set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = MBartDecoder(config=config) - - model.set_train(False) - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - axis=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - 
config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class MBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MBartDecoder, MBartForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (MBartForCausalLM,) if is_mindspore_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp(self,): - self.model_tester = MBartStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=MBartConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/transformers/models/mbart50/__init__.py b/tests/transformers/models/mbart50/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mbart50/test_tokenization_mbart50.py b/tests/transformers/models/mbart50/test_tokenization_mbart50.py deleted file mode 100644 index e7fe1f106..000000000 --- a/tests/transformers/models/mbart50/test_tokenization_mbart50.py +++ /dev/null @@ -1,312 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch MBART50 model. 
""" - -import shutil -import tempfile -import unittest - -from mindnlp.transformers.tokenization_utils import BatchEncoding -from mindnlp.transformers import SPIECE_UNDERLINE, MBart50Tokenizer, MBart50TokenizerFast -from mindnlp.utils.testing_utils import ( - get_tests_dir, - nested_simplify, - require_sentencepiece, - require_tokenizers, - require_mindspore, - is_mindspore_available, - slow, -) - -from ...test_tokenization_common import TokenizerTesterMixin - - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - -if is_mindspore_available(): - from mindnlp.transformers.models.mbart.modeling_mbart import shift_tokens_right - -EN_CODE = 250004 -RO_CODE = 250020 - - -@require_sentencepiece -@require_tokenizers -class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "facebook/mbart-large-50-one-to-many-mmt" - tokenizer_class = MBart50Tokenizer - rust_tokenizer_class = MBart50TokenizerFast - test_rust_tokenizer = True - test_sentencepiece = True - - def setUp(self): - super().setUp() - - # We have a SentencePiece fixture for testing - tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 0 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - vocab_keys = list(self.get_tokenizer().get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), 1_054) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 1_054) - - def test_full_tokenizer(self): - tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), - [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], - ) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual(tokens,[SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", "."]) # fmt: skip - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual( - ids, - [ - value + tokenizer.fairseq_offset - for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] - ], - ) - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens,[SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "", "."],) # fmt: skip - - @slow - def test_tokenizer_integration(self): - expected_encoding = {'input_ids': [[250004, 11062, 82772, 7, 15, 82772, 538, 51529, 237, 17198, 1290, 206, 9, 215175, 1314, 136, 17198, 1290, 206, 9, 56359, 42, 122009, 9, 16466, 16, 87344, 4537, 9, 
4717, 78381, 6, 159958, 7, 15, 24480, 618, 4, 527, 22693, 5428, 4, 2777, 24480, 9874, 4, 43523, 594, 4, 803, 18392, 33189, 18, 4, 43523, 24447, 12399, 100, 24955, 83658, 9626, 144057, 15, 839, 22335, 16, 136, 24955, 83658, 83479, 15, 39102, 724, 16, 678, 645, 2789, 1328, 4589, 42, 122009, 115774, 23, 805, 1328, 46876, 7, 136, 53894, 1940, 42227, 41159, 17721, 823, 425, 4, 27512, 98722, 206, 136, 5531, 4970, 919, 17336, 5, 2], [250004, 20080, 618, 83, 82775, 47, 479, 9, 1517, 73, 53894, 333, 80581, 110117, 18811, 5256, 1295, 51, 152526, 297, 7986, 390, 124416, 538, 35431, 214, 98, 15044, 25737, 136, 7108, 43701, 23, 756, 135355, 7, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [250004, 581, 63773, 119455, 6, 147797, 88203, 7, 645, 70, 21, 3285, 10269, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # fmt: skip - - self.tokenizer_integration_test_util( - expected_encoding=expected_encoding, - model_name="facebook/mbart-large-50", - revision="d3913889c59cd5c9e456b269c376325eabad57e2", - ) - - # overwrite from test_tokenization_common to speed up test - def test_save_pretrained(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {}) - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, from_pt=True) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs, from_pt=True) - - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it save with the same files + the tokenizer.json file for the fast one - self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) - tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) - self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) - - # Checks everything 
loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2, from_pt=True) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2, from_pt=True) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) - # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) - - shutil.rmtree(tmpdirname2) - - # Save tokenizer rust, legacy_format=True - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it save with the same files - self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2, from_pt=True) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2, from_pt=True) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - # Save tokenizer rust, legacy_format=False - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it saved the tokenizer.json file - self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2, from_pt=True) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2, from_pt=True) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class MBart50OneToManyIntegrationTest(unittest.TestCase): - checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" - ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' - " face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", - ] - expected_src_tokens = [EN_CODE, 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2] - - @classmethod - def setUpClass(cls): - cls.tokenizer: MBart50Tokenizer = MBart50Tokenizer.from_pretrained( - cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO" - ) - cls.pad_token_id = 1 - return cls - - def check_language_codes(self): - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ar_AR"], 250001) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["en_EN"], 250004) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ro_RO"], 250020) - 
self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["mr_IN"], 250038) - - def test_tokenizer_batch_encode_plus(self): - ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] - self.assertListEqual(self.expected_src_tokens, ids) - - def test_tokenizer_decode_ignores_language_codes(self): - self.assertIn(RO_CODE, self.tokenizer.all_special_ids) - generated_ids = [RO_CODE, 884, 9019, 96, 9, 916, 86792, 36, 18743, 15596, 5, 2] - result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) - self.assertEqual(result, expected_romanian) - self.assertNotIn(self.tokenizer.eos_token, result) - - def test_tokenizer_truncation(self): - src_text = ["this is gunna be a long sentence " * 20] - assert isinstance(src_text[0], str) - desired_max_length = 10 - ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] - self.assertEqual(ids[0], EN_CODE) - self.assertEqual(ids[-1], 2) - self.assertEqual(len(ids), desired_max_length) - - def test_mask_token(self): - self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [250053, 250001]) - - def test_special_tokens_unaffacted_by_save_load(self): - tmpdirname = tempfile.mkdtemp() - original_special_tokens = self.tokenizer.fairseq_tokens_to_ids - self.tokenizer.save_pretrained(tmpdirname) - new_tok = MBart50Tokenizer.from_pretrained(tmpdirname) - self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) - - @require_mindspore - def test_batch_fairseq_parity(self): - batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="ms") - batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id) - - # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4 - assert batch.input_ids[1][0] == EN_CODE - assert batch.input_ids[1][-1] == 2 - assert batch.labels[1][0] == RO_CODE - assert batch.labels[1][-1] == 2 - assert batch.decoder_input_ids[1][:2].tolist() == [2, RO_CODE] - - @require_mindspore - def test_tokenizer_prepare_batch(self): - batch = self.tokenizer( - self.src_text, - text_target=self.tgt_text, - padding=True, - truncation=True, - max_length=len(self.expected_src_tokens), - return_tensors="ms", - ) - batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id) - - self.assertIsInstance(batch, BatchEncoding) - - self.assertEqual((2, 14), batch.input_ids.shape) - self.assertEqual((2, 14), batch.attention_mask.shape) - result = batch.input_ids.tolist()[0] - self.assertListEqual(self.expected_src_tokens, result) - self.assertEqual(2, batch.decoder_input_ids[0, 0]) # decoder_start_token_id - # Test that special tokens are reset - self.assertEqual(self.tokenizer.prefix_tokens, [EN_CODE]) - self.assertEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) - - def test_seq2seq_max_target_length(self): - batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="ms") - targets = self.tokenizer( - text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="ms" - ) - labels = targets["input_ids"] - batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) - - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.decoder_input_ids.shape[1], 10) - - @require_mindspore - def test_tokenizer_translation(self): - inputs = 
self.tokenizer._build_translation_inputs( - "A test", return_tensors="ms", src_lang="en_XX", tgt_lang="ar_AR" - ) - - self.assertEqual( - nested_simplify(inputs), - { - # en_XX, A, test, EOS - "input_ids": [[250004, 62, 3034, 2]], - "attention_mask": [[1, 1, 1, 1]], - # ar_AR - "forced_bos_token_id": 250001, - }, - ) diff --git a/tests/transformers/models/mctct/__init__.py b/tests/transformers/models/mctct/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mctct/test_feature_extraction_mctct.py b/tests/transformers/models/mctct/test_feature_extraction_mctct.py deleted file mode 100644 index dba0603cc..000000000 --- a/tests/transformers/models/mctct/test_feature_extraction_mctct.py +++ /dev/null @@ -1,311 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import itertools -import random -import unittest - -import numpy as np - -from mindnlp.transformers import MCTCTFeatureExtractor -from mindnlp.utils.testing_utils import require_mindspore - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -global_rng = random.Random() - - -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for _batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -@require_mindspore -class MCTCTFeatureExtractionTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - min_seq_length=400, - max_seq_length=2000, - feature_size=24, - num_mel_bins=24, - padding_value=0.0, - sampling_rate=16_000, - return_attention_mask=True, - do_normalize=True, - ): - self.parent = parent - self.batch_size = batch_size - self.min_seq_length = min_seq_length - self.max_seq_length = max_seq_length - self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) - self.feature_size = feature_size - self.num_mel_bins = num_mel_bins - self.padding_value = padding_value - self.sampling_rate = sampling_rate - self.return_attention_mask = return_attention_mask - self.do_normalize = do_normalize - - def prepare_feat_extract_dict(self): - return { - "feature_size": self.feature_size, - "num_mel_bins": self.num_mel_bins, - "padding_value": self.padding_value, - "sampling_rate": self.sampling_rate, - "return_attention_mask": self.return_attention_mask, - "do_normalize": self.do_normalize, - } - - def prepare_inputs_for_common(self, equal_length=False, numpify=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_length: - speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] - else: - # make sure that inputs increase in size - speech_inputs = [ - floats_list((x, self.feature_size)) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) - ] - if 
numpify: - speech_inputs = [np.asarray(x) for x in speech_inputs] - return speech_inputs - - -@require_mindspore -class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = MCTCTFeatureExtractor - - def setUp(self): - self.feat_extract_tester = MCTCTFeatureExtractionTester(self) - - def _check_zero_mean_unit_variance(self, input_vector): - self.assertTrue(np.all(np.mean(input_vector) < 1e-3)) - self.assertTrue(np.all(np.abs(np.var(input_vector) - 1) < 1e-3)) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test feature size - input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features - self.assertTrue(input_features.ndim == 3) - self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size) - - # Test not batched input - encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. 
- speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_cepstral_mean_and_variance_normalization(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)] - - paddings = ["longest", "max_length", "do_not_pad"] - max_lengths = [None, 16, None] - for max_length, padding in zip(max_lengths, paddings): - inputs = feature_extractor( - speech_inputs, - padding=padding, - max_length=max_length, - return_attention_mask=True, - truncation=max_length is not None, # reference to #16419 - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = [np.sum(x) for x in attention_mask] - self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]]) - self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]]) - - def test_cepstral_mean_and_variance_normalization_np(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)] - - paddings = ["longest", "max_length", "do_not_pad"] - max_lengths = [None, 16, None] - for max_length, padding in zip(max_lengths, paddings): - inputs = feature_extractor( - speech_inputs, - max_length=max_length, - padding=padding, - return_tensors="np", - return_attention_mask=True, - truncation=max_length is not None, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = [np.sum(x) for x in attention_mask] - - self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]]) - self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6) - self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]]) - self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6) - self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]]) - - def test_cepstral_mean_and_variance_normalization_trunc_max_length(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)] - inputs = feature_extractor( - speech_inputs, - padding="max_length", - max_length=4, - truncation=True, - return_tensors="np", - return_attention_mask=True, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) - - self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1]) - self._check_zero_mean_unit_variance(input_features[2]) - - def test_cepstral_mean_and_variance_normalization_trunc_longest(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)] - 
inputs = feature_extractor( - speech_inputs, - padding="longest", - max_length=4, - truncation=True, - return_tensors="np", - return_attention_mask=True, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) - - self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) - self._check_zero_mean_unit_variance(input_features[2]) - - # make sure that if max_length < longest -> then pad to max_length - self.assertEqual(input_features.shape, (3, 4, 24)) - - speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)] - inputs = feature_extractor( - speech_inputs, - padding="longest", - max_length=16, - truncation=True, - return_tensors="np", - return_attention_mask=True, - ) - input_features = inputs.input_features - attention_mask = inputs.attention_mask - fbank_feat_lengths = np.sum(attention_mask == 1, axis=1) - - self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]]) - self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]]) - self._check_zero_mean_unit_variance(input_features[2]) - - # make sure that if max_length < longest -> then pad to max_length - self.assertEqual(input_features.shape, (3, 16, 24)) - - def test_double_precision_pad(self): - import mindspore as ms - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="ms") - self.assertTrue(pt_processed.input_features.dtype == ms.float32) - - def test_different_window(self): - import mindspore as ms - - init_dict = self.feat_extract_tester.prepare_feat_extract_dict() - init_dict["win_function"] = "hann_window" - - feature_extractor = self.feature_extraction_class(**init_dict) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="ms") - self.assertTrue(pt_processed.input_features.dtype == ms.float32) - - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_integration(self): - # fmt: off - expected = np.array([ - [ - 1.1280, 1.1319, 1.2744, 1.4369, 1.4328, 1.3671, 1.2889, 1.3046, - 1.4419, 0.8387, 0.2995, 0.0404, 0.1068, 0.0472, 0.3728, 1.3356, - 1.4491, 0.4770, 0.3997, 0.2776, 0.3184, -0.1243, -0.1170, -0.0828 - ], - [ - 1.0826, 1.0565, 1.2110, 1.3886, 1.3416, 1.2009, 1.1894, 1.2707, - 1.5153, 0.7005, 0.4916, 0.4017, 0.3743, 0.1935, 0.4228, 1.1084, - 0.9768, 0.0608, 0.2044, 0.1723, 
0.0433, -0.2360, -0.2478, -0.2643 - ], - [ - 1.0590, 0.9923, 1.1185, 1.3309, 1.1971, 1.0067, 1.0080, 1.2036, - 1.5397, 1.0383, 0.7672, 0.7551, 0.4878, 0.8771, 0.7565, 0.8775, - 0.9042, 0.4595, 0.6157, 0.4954, 0.1857, 0.0307, 0.0199, 0.1033 - ], - ]) - # fmt: on - - input_speech = self._load_datasamples(1) - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="ms").input_features - self.assertTrue(np.allclose(input_features[0, 100:103].asnumpy(), expected, atol=1e-4)) diff --git a/tests/transformers/models/mctct/test_modeling_mctct.py b/tests/transformers/models/mctct/test_modeling_mctct.py deleted file mode 100644 index 6d6993727..000000000 --- a/tests/transformers/models/mctct/test_modeling_mctct.py +++ /dev/null @@ -1,653 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the Mindspore MCTCT model. """ - -import inspect -import math -import unittest -import numpy as np - -from datasets import load_dataset - -from mindnlp.transformers import MCTCTConfig -from mindnlp.utils.testing_utils import require_soundfile, require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - - from mindnlp.transformers import MCTCTForCTC, MCTCTModel, MCTCTProcessor - - -class MCTCTModelTester: - def __init__( - self, - parent, - batch_size=10, - seq_length=40, # speech is longer - is_training=False, - vocab_size=32, - hidden_size=128 * 4, - num_hidden_layers=4, - intermediate_size=20, - num_attention_heads=4, - attention_head_dim=128, - max_position_embeddings=920, - layer_norm_eps=1e-5, - layerdrop=0.3, - hidden_act="relu", - initializer_range=0.02, - hidden_dropout_prob=0.3, - attention_probs_dropout_prob=0.3, - conv_glu_dim=1, - conv_dropout=0.3, - num_conv_layers=1, - conv_kernel=(7,), - conv_stride=(3,), - input_feat_per_channel=80, - input_channels=1, - conv_channels=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length # speech is longer - self.is_training = is_training - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.num_attention_heads = num_attention_heads - - self.attention_head_dim = attention_head_dim - self.max_position_embeddings = max_position_embeddings - - self.layer_norm_eps = layer_norm_eps - self.layerdrop = layerdrop - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - - self.conv_glu_dim = conv_glu_dim - self.conv_dropout = 
conv_dropout - self.num_conv_layers = num_conv_layers - self.conv_kernel = conv_kernel - self.conv_stride = conv_stride - self.input_feat_per_channel = input_feat_per_channel - self.input_channels = input_channels - self.conv_channels = conv_channels - - output_seq_length = self.seq_length - dilation = 1 - for _, kernel_sz, stride in zip(range(self.num_conv_layers), self.conv_kernel, self.conv_stride): - padding = kernel_sz // 2 - output_seq_length = output_seq_length + 2 * padding - dilation * (kernel_sz - 1) - 1 - output_seq_length = output_seq_length // stride + 1 - - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - def prepare_config_and_inputs(self): - input_features = floats_tensor( - [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size - ) - attention_mask = ops.ones([self.batch_size, self.seq_length], dtype=ms.int64) - - config = self.get_config() - - return config, input_features, attention_mask - - def get_config(self): - return MCTCTConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=self.intermediate_size, - num_attention_heads=self.num_attention_heads, - attention_head_dim=self.attention_head_dim, - max_position_embeddings=self.max_position_embeddings, - layer_norm_eps=self.layer_norm_eps, - layerdrop=self.layerdrop, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - conv_glu_dim=self.conv_glu_dim, - conv_dropout=self.conv_dropout, - num_conv_layers=self.num_conv_layers, - conv_kernel=self.conv_kernel, - conv_stride=self.conv_stride, - input_feat_per_channel=self.input_feat_per_channel, - input_channels=self.input_channels, - conv_channels=self.conv_channels, - ) - - def create_and_check_model(self, config, input_features, attention_mask): - model = MCTCTModel(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_for_ctc(self, config, input_features, attention_mask): - config.add_adapter = True - config.output_hidden_size = 2 * config.hidden_size - model = MCTCTForCTC(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) - ) - - def create_and_check_batch_inference(self, config, input_features, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = MCTCTModel(config=config) - model.set_train(False) - - input_features = input_features[:3] - attention_mask = ops.ones(input_features.shape[:-1], dtype=ms.bool_) - - input_lengths = [input_features.shape[-1] // i for i in [2, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_features, attention_mask=attention_mask).last_hidden_state - - for i in range(input_features.shape[0]): - input_slice = input_features[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : 
output.shape[1]] - self.parent.assertTrue(np.allclose(output.asnumpy(), batch_output.asnumpy(), atol=1e-3)) - - def check_ctc_loss(self, config, input_features, *args): - model = MCTCTForCTC(config=config) - - - # make sure that dropout is disabled - model.set_train(False) - - input_features = input_features[:3] - - # input_features is a 2D window for each sequence - attention_mask = ops.ones(input_features.shape[:-1], dtype=ms.int64) - - # -2 since input_features is a 2D window for each sequence in batch - input_lengths = [input_features.shape[-2] // i for i in [2, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) - labels = ids_tensor((input_features.shape[0], (min(max_length_labels) - 1).item()), model.config.vocab_size) - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_ctc_training(self, config, input_features, *args): - config.ctc_zero_infinity = True - model = MCTCTForCTC(config=config) - model.set_train(True) - - input_features = input_features[:3] - - input_lengths = [input_features.shape[-2] // i for i in [2, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) - labels = ids_tensor((input_features.shape[0], (max(max_length_labels) - 1).item()), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lenghts are at least - # one shorter than logit lenghts to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_features, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # loss.backward() - - def check_labels_out_of_vocab(self, config, input_features, *args): - model = MCTCTForCTC(config) - # model.to(torch_device) - # model.train() - # model.set_train(True) - input_features = input_features[:3] - - input_lengths = [input_features.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(ms.tensor(input_lengths)) - labels = ids_tensor((input_features.shape[0], (max(max_length_labels) - 2).item()), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_features, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_features, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_features": input_features, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class MCTCTModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (MCTCTForCTC,) if is_mindspore_available() else () - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = MCTCTModelTester(self) - self.config_tester = ConfigTester(self, config_class=MCTCTConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # MCTCT has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_features` - def test_forward_signature(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_features", - "attention_mask", - "head_mask", - "output_attentions", - "output_hidden_states", - "return_dict", - ] - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - # MCTCT cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # MCTCT has no inputs_embeds - def test_model_get_set_embeddings(self): - pass - - def test_model_is_small(self): - pass - - - @unittest.skip("MindSpore has no retain grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - config.layerdrop = 0.0 - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - input_features = inputs_dict["input_features"] - - input_lengths = ms.tensor( - [input_features.shape[1] for _ in range(input_features.shape[0])], dtype=ms.int64 - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_features.shape[0], (output_lengths[0] - 2).item()), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - # hidden_states.retain_grad() - # attentions.retain_grad() - - # output.flatten()[0].backward(retain_graph=True) - - # self.assertIsNotNone(hidden_states.grad) - # self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): 
- self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - @slow - def test_model_from_pretrained(self): - model = MCTCTModel.from_pretrained("speechbrain/m-ctc-t-large") - self.assertIsNotNone(model) - - -@require_mindspore -class MCTCTRobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (MCTCTForCTC,) if is_mindspore_available() else () - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = MCTCTModelTester(self) - self.config_tester = ConfigTester(self, config_class=MCTCTConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # MCTCT has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_features` - def test_forward_signature(self): - pass - - # MCTCT cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # MCTCT has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_get_set_embeddings(self): - pass - - @unittest.skip("MindSpore has no retain grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_features = inputs_dict["input_features"] - - input_lengths = ms.tensor( - [input_features.shape[1] for _ in range(input_features.shape[0])], dtype=ms.int64 - ) - 
output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_features.shape[0], (output_lengths[0] - 2).item()), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = MCTCTModel.from_pretrained("speechbrain/m-ctc-t-large") - self.assertIsNotNone(model) - - def test_model_is_small(self): - pass - - -@require_mindspore -@require_soundfile -@slow -class MCTCTModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_inference_ctc_normal(self): - model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large") - processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_features = processor(input_speech, return_tensors="ms").input_features - - model.set_train(False) - - logits = 
model(input_features).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe, sir, i exist."] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - - def test_inference_ctc_normal_batched(self): - model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large") - - processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_features = inputs.input_features - attention_mask = inputs.attention_mask - - model.set_train(False) - logits = model(input_features, attention_mask=attention_mask).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe, sir, i exist.", - '"sweat-covered brion\'s body, trickling into the tight-lowing clossa was the only germent huor."', - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_robust_batched(self): - model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large") - processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="ms", padding=True, return_attention_mask=True) - - input_features = inputs.input_features - attention_mask = inputs.attention_mask - - model.set_train(False) - - logits = model(input_features, attention_mask=attention_mask).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe, sir, i exist.", - '"sweat-covered brion\'s body, trickling into the tight-lowing clossa was the only germent huor." "', - "\"the cadona's chest still-dripping bloodthe acofis overstrained eyes, even the soring arena around him" - " with thousands of spectators retrivialities not worth-thinking about.", - "his instant panic was followed by a small sharp blow high on his chestr.", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) diff --git a/tests/transformers/models/mctct/test_processor_mctct.py b/tests/transformers/models/mctct/test_processor_mctct.py deleted file mode 100644 index 771b460b4..000000000 --- a/tests/transformers/models/mctct/test_processor_mctct.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import os -import shutil -import tempfile -import unittest - -from mindnlp.transformers import MCTCTProcessor -from mindnlp.configs import FEATURE_EXTRACTOR_NAME -from mindnlp.transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizer -from mindnlp.utils.testing_utils import require_mindspore, is_mindspore_available - - -if is_mindspore_available(): - from mindnlp.transformers import MCTCTFeatureExtractor - - from .test_feature_extraction_mctct import floats_list - - -@require_mindspore -class MCTCTProcessorTest(unittest.TestCase): - def setUp(self): - vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - self.add_kwargs_tokens_map = { - "pad_token": "", - "unk_token": "", - "bos_token": "", - "eos_token": "", - } - feature_extractor_map = { - "feature_size": 1, - "padding_value": 0.0, - "sampling_rate": 16000, - "return_attention_mask": False, - "do_normalize": True, - } - - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - - with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(feature_extractor_map) + "\n") - - def get_tokenizer(self, **kwargs_init): - kwargs = self.add_kwargs_tokens_map.copy() - kwargs.update(kwargs_init) - return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_feature_extractor(self, **kwargs): - return MCTCTFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - - processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - processor.save_pretrained(self.tmpdirname) - processor = MCTCTProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, MCTCTFeatureExtractor) - - def test_save_load_pretrained_additional_features(self): - processor = MCTCTProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) - - processor = MCTCTProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, MCTCTFeatureExtractor) - - def test_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = MCTCTProcessor(tokenizer=tokenizer, 
feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(raw_speech, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = MCTCTProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - self.assertListEqual( - processor.model_input_names, - feature_extractor.model_input_names, - msg="`processor` and `feature_extractor` model input names do not match", - ) diff --git a/tests/transformers/models/megatron_bert/__init__.py b/tests/transformers/models/megatron_bert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/megatron_bert/test_modeling_megatron_bert.py b/tests/transformers/models/megatron_bert/test_modeling_megatron_bert.py deleted file mode 100644 index 4d03f4223..000000000 --- a/tests/transformers/models/megatron_bert/test_modeling_megatron_bert.py +++ /dev/null @@ -1,382 +0,0 @@ -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# Copyright 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore MegatronBERT model.""" - -import math -import os -import unittest - -from mindnlp.transformers import MegatronBertConfig, is_mindspore_available -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - MegatronBertForCausalLM, - MegatronBertForMaskedLM, - MegatronBertForMultipleChoice, - MegatronBertForNextSentencePrediction, - MegatronBertForPreTraining, - MegatronBertForQuestionAnswering, - MegatronBertForSequenceClassification, - MegatronBertForTokenClassification, - MegatronBertModel, - ) - - -class MegatronBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=64, - embedding_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.embedding_size = embedding_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return MegatronBertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - 
intermediate_size=self.intermediate_size, - embedding_size=self.embedding_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def create_and_check_megatron_bert_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MegatronBertModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_megatron_bert_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MegatronBertForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_causal_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MegatronBertForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_megatron_bert_for_next_sequence_prediction( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MegatronBertForNextSentencePrediction(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) - - def create_and_check_megatron_bert_for_pretraining( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MegatronBertForPreTraining(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - next_sentence_label=sequence_labels, - ) - self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) - - def create_and_check_megatron_bert_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MegatronBertForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_megatron_bert_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, 
sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = MegatronBertForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_megatron_bert_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = MegatronBertForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_megatron_bert_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = MegatronBertForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - MegatronBertModel, - MegatronBertForMaskedLM, - MegatronBertForCausalLM, - MegatronBertForMultipleChoice, - MegatronBertForNextSentencePrediction, - MegatronBertForPreTraining, - MegatronBertForQuestionAnswering, - MegatronBertForSequenceClassification, - MegatronBertForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": MegatronBertModel, - "fill-mask": MegatronBertForMaskedLM, - "question-answering": MegatronBertForQuestionAnswering, - "text-classification": MegatronBertForSequenceClassification, - "text-generation": MegatronBertForCausalLM, - "token-classification": MegatronBertForTokenClassification, - "zero-shot": MegatronBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - # test_resize_embeddings = False - test_head_masking = False - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - inputs_dict["next_sentence_label"] = ops.zeros( - self.model_tester.batch_size, 
dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = MegatronBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_megatron_bert_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_multiple_choice(*config_and_inputs) - - def test_for_next_sequence_prediction(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_next_sequence_prediction(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_pretraining(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_megatron_bert_for_token_classification(*config_and_inputs) - - -def _long_tensor(tok_lst): - return mindspore.tensor( - tok_lst, - dtype=mindspore.int64, - ) - - -TOLERANCE = 1e-4 - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class MegatronBertModelIntegrationTests(unittest.TestCase): - @slow - @unittest.skip(reason="Model is not available.") - def test_inference_no_head(self): - directory = "nvidia/megatron-bert-uncased-345m" - if "MYDIR" in os.environ: - directory = os.path.join(os.environ["MYDIR"], directory) - model = MegatronBertModel.from_pretrained(directory) - model.half() - input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) - with no_grad(): - output = model(input_ids)[0] - expected_shape = (1, 9, 1024) - self.assertEqual(output.shape, expected_shape) - - expected = [-0.6040, -0.2517, -0.1025, 0.3420, -0.6758, -0.0017, -0.1089, -0.1990, 0.5728] - for ii in range(3): - for jj in range(3): - a = output[0, ii, jj] - b = expected[3 * ii + jj] - msg = "ii={} jj={} a={} b={}".format(ii, jj, a, b) - self.assertTrue(math.isclose(a, b, rel_tol=TOLERANCE, abs_tol=TOLERANCE), msg=msg) \ No newline at end of file diff --git a/tests/transformers/models/mgp_str/__init__.py b/tests/transformers/models/mgp_str/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mgp_str/test_modeling_mgp_str.py b/tests/transformers/models/mgp_str/test_modeling_mgp_str.py deleted file mode 100644 index d685403ac..000000000 --- a/tests/transformers/models/mgp_str/test_modeling_mgp_str.py +++ /dev/null @@ -1,286 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch MGP-STR model.""" - -import unittest -import numpy as np -import requests - -from mindnlp.transformers import MgpstrConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import nn - from mindnlp.transformers import MgpstrForSceneTextRecognition, MgpstrModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MgpstrProcessor - - -class MgpstrModelTester: - def __init__( - self, - parent, - is_training=False, - batch_size=13, - image_size=(32, 128), - patch_size=4, - num_channels=3, - max_token_length=27, - num_character_labels=38, - num_bpe_labels=99, - num_wordpiece_labels=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - mlp_ratio=4.0, - patch_embeds_hidden_size=257, - output_hidden_states=None, - ): - self.parent = parent - self.is_training = is_training - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.max_token_length = max_token_length - self.num_character_labels = num_character_labels - self.num_bpe_labels = num_bpe_labels - self.num_wordpiece_labels = num_wordpiece_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_ratio = mlp_ratio - self.patch_embeds_hidden_size = patch_embeds_hidden_size - self.output_hidden_states = output_hidden_states - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]] - ) - config = self.get_config() - return config, pixel_values - - def get_config(self): - return MgpstrConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - max_token_length=self.max_token_length, - num_character_labels=self.num_character_labels, - num_bpe_labels=self.num_bpe_labels, - num_wordpiece_labels=self.num_wordpiece_labels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - mlp_ratio=self.mlp_ratio, - output_hidden_states=self.output_hidden_states, - ) - - def create_and_check_model(self, config, pixel_values): - model = MgpstrForSceneTextRecognition(config) - model.set_train(False) - generated_ids = model(pixel_values) - self.parent.assertEqual( - generated_ids[0][0].shape, - (self.batch_size, self.max_token_length, self.num_character_labels), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = 
{"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class MgpstrModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (MgpstrForSceneTextRecognition,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": MgpstrForSceneTextRecognition, - "image-feature-extraction": MgpstrModel, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = MgpstrModelTester(self) - self.config_tester = ConfigTester( - self, config_class=MgpstrConfig, has_text_modality=False - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="MgpstrModel does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - @unittest.skip(reason="MgpstrModel does not support feedforward chunking") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="MgpstrModel does not support gradient_checkpointing") - def test_gradient_checkpointing_backward_compatibility(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - # if not model_class.supports_gradient_checkpointing: - # continue - - config.gradient_checkpointing = True - model = model_class(config) - self.assertTrue(model.is_gradient_checkpointing) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, - "expected_num_hidden_layers", - self.model_tester.num_hidden_layers + 1, - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [ - self.model_tester.patch_embeds_hidden_size, - self.model_tester.hidden_size, - ], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # override as the `logit_scale` parameter initilization is different for MgpstrModel - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if isinstance(param, (nn.Linear, nn.Conv2d, nn.LayerNorm)): - if 
param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - -# We will verify our results on an image from the IIIT-5k dataset -def prepare_img(): - url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png" - im = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return im - - -@require_vision -@require_mindspore -class MgpstrModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "alibaba-damo/mgp-str-base" - model = MgpstrForSceneTextRecognition.from_pretrained(model_name, from_pt=True) - processor = MgpstrProcessor.from_pretrained(model_name, from_pt=True) - - image = prepare_img() - inputs = processor(images=image, return_tensors="ms").pixel_values - - # forward pass - outputs = model(inputs) - - # verify the logits - self.assertEqual(outputs.logits[0].shape, ((1, 27, 38))) - - out_strs = processor.batch_decode(outputs.logits) - expected_text = "ticket" - - self.assertEqual(out_strs["generated_text"][0], expected_text) - - expected_slice = ms.tensor( - [ - [ - [-39.5397, -44.4024, -36.1844], - [-61.4709, -63.8639, -58.3454], - [-74.0225, -68.5494, -71.2164], - ] - ], - ) - - self.assertTrue( - np.allclose( - outputs.logits[0][:, 1:4, 1:4].asnumpy(), - expected_slice.asnumpy(), - atol=1e-4, - ) - ) diff --git a/tests/transformers/models/mgp_str/test_processor_mgp_str.py b/tests/transformers/models/mgp_str/test_processor_mgp_str.py deleted file mode 100644 index 517200fbc..000000000 --- a/tests/transformers/models/mgp_str/test_processor_mgp_str.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MgpstrProcessor.""" - -import json -import os -import shutil -import tempfile -import unittest - -import numpy as np -import pytest - -from mindnlp.transformers import MgpstrTokenizer -from mindnlp.transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES -from mindnlp.configs import IMAGE_PROCESSOR_NAME -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MgpstrProcessor, ViTImageProcessor - - -@require_mindspore -@require_vision -class MgpstrProcessorTest(unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None - - @property - def image_processor_dict(self): - return self.prepare_image_processor_dict() - - def setUp(self): - self.image_size = (3, 32, 128) - self.tmpdirname = tempfile.mkdtemp() - - vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] # fmt: skip - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - - image_processor_map = { - "do_normalize": False, - "do_resize": True, - "image_processor_type": "ViTImageProcessor", - "resample": 3, - "size": {"height": 32, "width": 128}, - } - self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) - with open(self.image_processor_file, "w", encoding="utf-8") as fp: - json.dump(image_processor_map, fp) - - def get_tokenizer(self, **kwargs): - return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images.""" - - image_input = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8) - - image_input = Image.fromarray(np.moveaxis(image_input, 0, -1)) - - return image_input - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - processor.save_pretrained(self.tmpdirname) - processor = MgpstrProcessor.from_pretrained(self.tmpdirname, use_fast=False) - - self.assertEqual(processor.char_tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.char_tokenizer, MgpstrTokenizer) - - self.assertEqual( - processor.image_processor.to_json_string(), image_processor.to_json_string() - ) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) - - def test_save_load_pretrained_additional_features(self): - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor( - do_normalize=False, 
padding_value=1.0 - ) - - processor = MgpstrProcessor.from_pretrained( - self.tmpdirname, - bos_token="(BOS)", - eos_token="(EOS)", - do_normalize=False, - padding_value=1.0, - ) - - self.assertEqual( - processor.char_tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab() - ) - self.assertIsInstance(processor.char_tokenizer, MgpstrTokenizer) - - self.assertEqual( - processor.image_processor.to_json_string(), - image_processor_add_kwargs.to_json_string(), - ) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - - image_input = self.prepare_image_inputs() - - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_proc.keys(): - self.assertAlmostEqual( - input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2 - ) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - - input_str = "test" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - - input_str = "test" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), ["pixel_values", "labels"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - - predicted_ids = [ - [1, 4, 5, 8, 1, 0, 8], - [3, 4, 3, 1, 1, 8, 9], - [3, 4, 3, 1, 1, 8, 9], - ] - - decoded_processor = processor.char_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - decode_strs = [seq.replace(" ", "") for seq in decoded_tok] - - self.assertListEqual(decode_strs, decoded_processor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - - input_str = None - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), processor.model_input_names) - - def test_processor_batch_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = MgpstrProcessor( - tokenizer=tokenizer, image_processor=image_processor - ) - - char_input = ops.randn(1, 27, 38) - bpe_input = ops.randn(1, 27, 50257) - wp_input = ops.randn(1, 27, 30522) - - results = processor.batch_decode([char_input, bpe_input, wp_input]) - - self.assertListEqual( - list(results.keys()), - ["generated_text", "scores", "char_preds", "bpe_preds", "wp_preds"], - ) diff --git a/tests/transformers/models/mistral/__init__.py 
b/tests/transformers/models/mistral/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mistral/test_modeling_mistral.py b/tests/transformers/models/mistral/test_modeling_mistral.py deleted file mode 100644 index bd58e85ec..000000000 --- a/tests/transformers/models/mistral/test_modeling_mistral.py +++ /dev/null @@ -1,426 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch Mistral model. """ - - -import gc -import tempfile -import unittest - -import pytest - -import numpy as np - -from mindnlp.transformers import AutoTokenizer, MistralConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - MistralForCausalLM, - MistralForSequenceClassification, - MistralModel, - ) - -class MistralModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - 
input_mask = ops.tril(ops.ones((self.batch_size, self.seq_length))) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return MistralConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Mistral - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MistralModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Mistral - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = MistralModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Mistral - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = MistralForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Mistral - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - 
token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = MistralForCausalLM(config=config) - - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (MistralModel, MistralForCausalLM, MistralForSequenceClassification) if is_mindspore_available() else () - ) - all_generative_model_classes = (MistralForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": MistralModel, - "text-classification": MistralForSequenceClassification, - "text-generation": MistralForCausalLM, - "zero-shot": MistralForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = MistralModelTester(self) - self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_Mistral_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - print(config) - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = MistralForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Mistral_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = MistralForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Mistral_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = MistralForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @unittest.skip("Mistral buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip("Mistral uses GQA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - -@require_mindspore -class MistralIntegrationTest(unittest.TestCase): - @slow - def test_model_7b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - input_ids = mindspore.tensor([input_ids]) - 
out = model(input_ids).logits - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]]) - assert np.allclose(out.mean(-1).asnumpy(), EXPECTED_MEAN.asnumpy(), atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]) # fmt: skip - print(out[0, 0, :30]) - assert np.allclose(out[0, 0, :30].asnumpy(), EXPECTED_SLICE.asnumpy(), atol=1e-4, rtol=1e-4) - - - @slow - def test_model_7b_generation(self): - EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. I’m not a big""" - prompt = "My favourite condiment is " - tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) - model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - input_ids = tokenizer.encode(prompt, return_tensors="ms") - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - @unittest.skip('not support flash attention yet') - @slow - def test_model_7b_long_prompt(self): - EXPECTED_OUTPUT_TOKEN_IDS = [306, 338] - # An input with 4097 tokens that is above the size of the sliding window - input_ids = [1] + [306, 338] * 2048 - model = MistralForCausalLM.from_pretrained( - "mistralai/Mistral-7B-v0.1", - device_map="auto", - load_in_4bit=True, - attn_implementation="flash_attention_2", - ) - input_ids = mindspore.tensor([input_ids]) - generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) - - # Assisted generation - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 2 - assistant_model.generation_config.num_assistant_tokens_schedule = "constant" - generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) diff --git a/tests/transformers/models/mixtral/__init__.py b/tests/transformers/models/mixtral/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mixtral/test_modeling_mixtral.py b/tests/transformers/models/mixtral/test_modeling_mixtral.py deleted file mode 100644 index 03257d83a..000000000 --- a/tests/transformers/models/mixtral/test_modeling_mixtral.py +++ /dev/null @@ -1,459 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore Mixtral model. 
""" - - -import unittest -import numpy as np - -from mindnlp.transformers import MixtralConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import MixtralForCausalLM, MixtralForSequenceClassification, MixtralModel - - -class MixtralModelTester: - # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.__init__ - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return MixtralConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - 
hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - num_experts_per_tok=2, - num_local_experts=2, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Mixtral - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MixtralModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Mixtral - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = MixtralModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Mixtral - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = MixtralForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Mixtral - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = MixtralForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - 
encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Mixtral - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Mixtral -class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (MixtralModel, MixtralForCausalLM, MixtralForSequenceClassification) if is_mindspore_available() else () - ) - all_generative_model_classes = (MixtralForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": MixtralModel, - "text-classification": MixtralForSequenceClassification, - "text-generation": MixtralForCausalLM, - "zero-shot": MixtralForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = MixtralModelTester(self) - self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_Mixtral_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - print(config) - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = MixtralForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Mixtral_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = MixtralForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Mixtral_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = MixtralForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @unittest.skip("Mixtral buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip("Mixtral uses GQA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - # Ignore copy - def test_load_balancing_loss(self): - r""" - Let's make sure we can actually compute the loss and do a backward on it. 
- """ - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.num_local_experts = 8 - config.output_router_logits = True - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = MixtralForCausalLM(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask) - self.assertEqual(result.router_logits[0].shape, (91, config.num_local_experts)) - self.assertTrue(np.allclose(result.aux_loss.asnumpy(), np.array(2), rtol=1e-2, atol=1e-2)) - - # First, we make sure that adding padding tokens doesn't change the loss - # loss(input_ids, attention_mask=None) == loss(input_ids + padding, attention_mask=attention_mask_with_padding) - pad_length = 1000 - # Add padding tokens (assume that pad_token_id=1) to input_ids - padding_block = ops.ones(input_ids.shape[0], pad_length, dtype=mindspore.int64) - padded_input_ids = ops.cat((padding_block, input_ids), dim=1) # this is to simulate padding to the left - padded_attention_mask = padded_input_ids.ne(1) - - padded_result = model(padded_input_ids, attention_mask=padded_attention_mask) - self.assertTrue(np.allclose(result.aux_loss.asnumpy(), padded_result.aux_loss.asnumpy(), rtol=1e-4, atol=1e-4)) - - # We make sure that the loss of includding padding tokens != the loss without padding tokens - # if attention_mask=None --> we don't exclude padding tokens - include_padding_result = model(padded_input_ids, attention_mask=None) - - # This is to mimic torch.testing.assert_not_close - self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item()) - - -@require_mindspore -class MixtralIntegrationTest(unittest.TestCase): - @slow - @require_mindspore - def test_small_model_logits(self): - model_id = "hf-internal-testing/Mixtral-tiny" - dummy_input = mindspore.Tensor([[0, 1, 0], [0, 1, 0]]) - - model = MixtralForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16) - # TODO: might need to tweak it in case the logits do not match on our daily runners - # these logits have been obtained with the original megablocks impelmentation. 
- EXPECTED_LOGITS = mindspore.Tensor( - [[0.1670, 0.1620, 0.6094], [-0.8906, -0.1588, -0.6060], [0.1572, 0.1290, 0.7246]] - ) - - logits = model(dummy_input).logits - - self.assertTrue(np.allclose(logits[0, :3, :3].half().asnumpy(), EXPECTED_LOGITS.asnumpy(), atol=1e-3, rtol=1e-3)) - self.assertTrue(np.allclose(logits[1, :3, :3].half().asnumpy(), EXPECTED_LOGITS.asnumpy(), atol=1e-3, rtol=1e-3)) - - @slow - @require_mindspore - def test_small_model_logits_batched(self): - model_id = "hf-internal-testing/Mixtral-tiny" - dummy_input = mindspore.Tensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]) - attention_mask = dummy_input.ne(0).to(mindspore.int64) - - model = MixtralForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16) - # TODO: might need to tweak it in case the logits do not match on our daily runners - EXPECTED_LOGITS_LEFT = mindspore.Tensor( - [[0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007]], - ) - - # logits[0, -3:, -3:].half() - EXPECTED_LOGITS_LEFT_UNPADDED = mindspore.Tensor( - [[0.2212, 0.5200, -0.3816], [0.8213, -0.2313, 0.6069], [0.2664, -0.7090, 0.2468]], - ) - - # logits[1, -3:, -3:].half() - EXPECTED_LOGITS_RIGHT_UNPADDED = mindspore.Tensor( - [[0.2205, 0.1232, -0.1611], [-0.3484, 0.3030, -1.0312], [0.0742, 0.7930, 0.7969]] - ) - - logits = model(dummy_input, attention_mask=attention_mask).logits - - print(logits[0, :3, :3].half().asnumpy(), EXPECTED_LOGITS_LEFT.asnumpy()) - self.assertTrue(np.allclose(logits[0, :3, :3].half().asnumpy(), EXPECTED_LOGITS_LEFT.asnumpy(), atol=1e-3, rtol=1e-3)) - self.assertTrue(np.allclose(logits[0, -3:, -3:].half().asnumpy(), EXPECTED_LOGITS_LEFT_UNPADDED.asnumpy(), atol=1e-3, rtol=1e-3)) - self.assertTrue(np.allclose(logits[1, -3:, -3:].half().asnumpy(), EXPECTED_LOGITS_RIGHT_UNPADDED.asnumpy(), atol=1e-3, rtol=1e-3)) - - @slow - @require_mindspore - def test_small_model_generate_time(self): - model_id = "hf-internal-testing/Mixtral-tiny" - dummy_input = mindspore.Tensor([[0, 1, 0], [0, 1, 0]]) - - model = MixtralForCausalLM.from_pretrained(model_id, ms_dtype=mindspore.float16) - # TODO: might need to tweak it in case the logits do not match on our daily runners - # these logits have been obtained with the original megablocks impelmentation. - model.generate(dummy_input, max_new_tokens=20) diff --git a/tests/transformers/models/mllama/__init__.py b/tests/transformers/models/mllama/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mllama/test_image_processing_mllama.py b/tests/transformers/models/mllama/test_image_processing_mllama.py deleted file mode 100644 index d1bce2cd8..000000000 --- a/tests/transformers/models/mllama/test_image_processing_mllama.py +++ /dev/null @@ -1,356 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MllamaImageProcessor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - -class MllamaImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - num_images=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_convert_rgb=True, - do_pad=True, - max_image_tiles=4, - ): - super().__init__() - size = size if size is not None else {"height": 224, "width": 224} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.max_image_tiles = max_image_tiles - self.image_size = image_size - self.num_images = num_images - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_convert_rgb = do_convert_rgb - self.do_pad = do_pad - - def prepare_image_processor_dict(self): - return { - "do_convert_rgb": self.do_convert_rgb, - "do_resize": self.do_resize, - "size": self.size, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_pad": self.do_pad, - "max_image_tiles": self.max_image_tiles, - } - - def prepare_image_inputs( - self, - batch_size=None, - min_resolution=None, - max_resolution=None, - num_channels=None, - num_images=None, - size_divisor=None, - equal_resolution=False, - numpify=False, - msify=False, - ): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies msify=True. - - One can specify whether the images are of the same resolution or not. 
- """ - assert not (numpify and msify), "You cannot specify both numpy and PyTorch tensors at the same time" - - batch_size = batch_size if batch_size is not None else self.batch_size - min_resolution = min_resolution if min_resolution is not None else self.min_resolution - max_resolution = max_resolution if max_resolution is not None else self.max_resolution - num_channels = num_channels if num_channels is not None else self.num_channels - num_images = num_images if num_images is not None else self.num_images - - images_list = [] - for i in range(batch_size): - images = [] - for j in range(num_images): - if equal_resolution: - width = height = max_resolution - else: - # To avoid getting image width/height 0 - if size_divisor is not None: - # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` - min_resolution = max(size_divisor, min_resolution) - width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) - images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) - images_list.append(images) - - if not numpify and not msify: - # PIL expects the channel dimension as last dimension - images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list] - - if msify: - images_list = [[ops.from_numpy(image) for image in images] for images in images_list] - - return images_list - - def expected_output_image_shape(self, images): - expected_output_image_shape = ( - max(len(images) for images in images), - self.max_image_tiles, - self.num_channels, - self.size["height"], - self.size["width"], - ) - return expected_output_image_shape - - -@require_mindspore -@require_vision -class MllamaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = MllamaImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = MllamaImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_pad")) - self.assertTrue(hasattr(image_processing, "max_image_tiles")) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for sample_images in image_inputs: - for image in sample_images: - self.assertIsInstance(image, np.ndarray) - - expected_output_image_shape = ( - max(len(images) for images in image_inputs), - self.image_processor_tester.max_image_tiles, - self.image_processor_tester.num_channels, - self.image_processor_tester.size["height"], - self.image_processor_tester.size["width"], - ) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], 
return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for images in image_inputs: - for image in images: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) - - def test_call_mindspore(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, msify=True) - - for images in image_inputs: - for image in images: - self.assertIsInstance(image, mindspore.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - tuple(encoded_images.shape), - (self.image_processor_tester.batch_size, *expected_output_image_shape), - ) - - def test_call_numpy_4_channels(self): - self.skipTest("4 channels input is not supported yet") - - def test_image_correctly_tiled(self): - def get_empty_tiles(pixel_values): - # image has shape batch_size, max_num_images, max_image_tiles, num_channels, height, width - # we want to get a binary mask of shape batch_size, max_num_images, max_image_tiles - # of empty tiles, i.e. 
tiles that are completely zero - return np.all(pixel_values == 0, axis=(3, 4, 5)) - - image_processor_dict = {**self.image_processor_dict, "size": {"height": 50, "width": 50}, "max_image_tiles": 4} - image_processor = self.image_processing_class(**image_processor_dict) - - # image fits 2x2 tiles grid (width x height) - image = Image.new("RGB", (80, 95)) - inputs = image_processor(image, return_tensors="np") - pixel_values = inputs.pixel_values - empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist() - self.assertEqual(empty_tiles, [False, False, False, False]) - aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0] - self.assertEqual(aspect_ratio_ids, 6) - aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist() - self.assertEqual(aspect_ratio_mask, [1, 1, 1, 1]) - - # image fits 3x1 grid (width x height) - image = Image.new("RGB", (101, 50)) - inputs = image_processor(image, return_tensors="np") - pixel_values = inputs.pixel_values - empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist() - self.assertEqual(empty_tiles, [False, False, False, True]) - aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0] - self.assertEqual(aspect_ratio_ids, 3) - num_tiles = inputs.aspect_ratio_mask[0, 0].sum() - self.assertEqual(num_tiles, 3) - aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist() - self.assertEqual(aspect_ratio_mask, [1, 1, 1, 0]) - - # image fits 1x1 grid (width x height) - image = Image.new("RGB", (20, 39)) - inputs = image_processor(image, return_tensors="np") - pixel_values = inputs.pixel_values - empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist() - self.assertEqual(empty_tiles, [False, True, True, True]) - aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0] - self.assertEqual(aspect_ratio_ids, 1) - aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist() - self.assertEqual(aspect_ratio_mask, [1, 0, 0, 0]) - - # image fits 2x1 grid (width x height) - image = Image.new("RGB", (51, 20)) - inputs = image_processor(image, return_tensors="np") - pixel_values = inputs.pixel_values - empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist() - self.assertEqual(empty_tiles, [False, False, True, True]) - aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0] - self.assertEqual(aspect_ratio_ids, 2) - aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist() - self.assertEqual(aspect_ratio_mask, [1, 1, 0, 0]) - - # image is greater than 2x2 tiles grid (width x height) - image = Image.new("RGB", (150, 150)) - inputs = image_processor(image, return_tensors="np") - pixel_values = inputs.pixel_values - empty_tiles = get_empty_tiles(pixel_values)[0, 0].tolist() - self.assertEqual(empty_tiles, [False, False, False, False]) - aspect_ratio_ids = inputs.aspect_ratio_ids[0, 0] - self.assertEqual(aspect_ratio_ids, 6) # (2 - 1) * 4 + 2 = 6 - aspect_ratio_mask = inputs.aspect_ratio_mask[0, 0].tolist() - self.assertEqual(aspect_ratio_mask, [1, 1, 1, 1]) - - # batch of images - image1 = Image.new("RGB", (80, 95)) - image2 = Image.new("RGB", (101, 50)) - image3 = Image.new("RGB", (23, 49)) - inputs = image_processor([[image1], [image2, image3]], return_tensors="np") - pixel_values = inputs.pixel_values - empty_tiles = get_empty_tiles(pixel_values).tolist() - expected_empty_tiles = [ - # sample 1 with 1 image 2x2 grid - [ - [False, False, False, False], - [True, True, True, True], # padding - ], - # sample 2 - [ - [False, False, False, True], # 3x1 - [False, True, True, True], # 1x1 - ], - ] - self.assertEqual(empty_tiles, expected_empty_tiles) - aspect_ratio_ids = inputs.aspect_ratio_ids.tolist() - 
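The expected `aspect_ratio_ids` just below ([[6, 0], [3, 1]]) appear to encode each image's tile grid as a 1-based index into the list of supported (tiles_high, tiles_wide) grids for max_image_tiles=4, with 0 reserved for padded (absent) image slots. A minimal sketch of that mapping, assuming the processor enumerates the supported grids in the order shown (names here are illustrative, not the library's API):

    # supported tile grids for max_image_tiles=4, assumed enumeration order
    SUPPORTED = [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)]

    def aspect_ratio_id(tiles_high, tiles_wide):
        # 1-based index into SUPPORTED; 0 is kept for padded image slots
        return SUPPORTED.index((tiles_high, tiles_wide)) + 1

    assert aspect_ratio_id(2, 2) == 6  # 80x95 image tiled 2 wide x 2 high
    assert aspect_ratio_id(1, 3) == 3  # 101x50 image tiled 3 wide x 1 high
    assert aspect_ratio_id(1, 1) == 1  # small image, single tile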
expected_aspect_ratio_ids = [[6, 0], [3, 1]] - self.assertEqual(aspect_ratio_ids, expected_aspect_ratio_ids) - aspect_ratio_mask = inputs.aspect_ratio_mask.tolist() - expected_aspect_ratio_mask = [ - [ - [1, 1, 1, 1], - [1, 0, 0, 0], - ], - [ - [1, 1, 1, 0], - [1, 0, 0, 0], - ], - ] - self.assertEqual(aspect_ratio_mask, expected_aspect_ratio_mask) \ No newline at end of file diff --git a/tests/transformers/models/mllama/test_modeling_mllama.py b/tests/transformers/models/mllama/test_modeling_mllama.py deleted file mode 100644 index b64228066..000000000 --- a/tests/transformers/models/mllama/test_modeling_mllama.py +++ /dev/null @@ -1,578 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Mllama model.""" - -import gc -import unittest - -import requests - -from mindnlp.transformers import ( - AutoProcessor, - # BitsAndBytesConfig, - MllamaConfig, - MllamaForCausalLM, - MllamaForConditionalGeneration, -) -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) -from mindnlp.transformers.models.mllama.configuration_mllama import MllamaTextConfig -from mindnlp.utils.testing_utils import ( - is_flaky, - # require_bitsandbytes, - require_mindspore, - require_mindspore_gpu, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - -if is_vision_available(): - from PIL import Image - - -class MllamaText2TextModelTester: - def __init__( - self, - parent, - ignore_index=-100, - seq_length=7, - is_training=True, - text_config={ - "model_type": "mllama", - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "num_key_value_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "max_position_embeddings": 512, - "initializer_range": 0.02, - "rope_scaling": {"rope_type": "default"}, - "pad_token_id": 0, - "bos_token_id": 1, - "eos_token_id": 2, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.text_config = text_config - self.seq_length = seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - self.pad_token_id = self.text_config["pad_token_id"] - self.batch_size = 3 - - def get_config(self): - return MllamaTextConfig(**self.text_config) - - def prepare_config_and_inputs(self): - config = self.get_config() - input_ids = ids_tensor([self.batch_size, self.seq_length], config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1) - return config, input_ids, attention_mask - - def prepare_config_and_inputs_for_common(self): - config, input_ids, 
attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return config, inputs_dict - - def create_and_check_mllama_model_fp16_forward(self, config, input_ids, attention_mask): - model = MllamaForCausalLM(config=config) - model.eval() - # with torch.autocast(device_type="cuda", dtype=torch.float16): - logits = model( - input_ids=input_ids, - attention_mask=attention_mask, - return_dict=True, - )["logits"] - self.parent.assertFalse(ops.isnan(logits).any().item()) - - -@require_mindspore -class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - """ - Model tester for `MllamaForConditionalGeneration`. - """ - - all_model_classes = (MllamaForCausalLM,) if is_mindspore_available() else () - all_generative_model_classes = (MllamaForCausalLM,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - _torch_compile_test_ckpt = "nltpt/Llama-3.2-11B-Vision" - - def setUp(self): - self.model_tester = MllamaText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True) - - -class MllamaVisionText2TextModelTester: - def __init__( - self, - parent, - ignore_index=-100, - image_token_index=4, - seq_length=7, - is_training=True, - text_config={ - "model_type": "mllama", - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 4, - "num_attention_heads": 4, - "num_key_value_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "max_position_embeddings": 512, - "initializer_range": 0.02, - "rope_scaling": {"rope_type": "default"}, - "pad_token_id": 0, - "bos_token_id": 1, - "eos_token_id": 2, - "cross_attention_layers": [1], - }, - vision_config={ - "image_size": 30, - "patch_size": 2, - "num_channels": 3, - "hidden_size": 16, - "intermediate_layers_indices": [0], - "vision_output_dim": 32, - "projection_dim": 32, - "num_hidden_layers": 6, - "num_global_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "dropout": 0.1, - "initializer_range": 0.02, - "supported_aspect_ratios": [[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [3, 1], [4, 1]], - }, - ): - self.parent = parent - self.is_training = is_training - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.text_config = text_config - self.vision_config = vision_config - self.seq_length = seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.pad_token_id = self.text_config["pad_token_id"] - - self.batch_size = 3 - self.num_channels = 3 - self.image_size = 224 - self.max_num_images = 1 - self.max_image_tiles = 4 - self.image_length = 904 - - def get_config(self): - return MllamaConfig( - text_config=self.text_config, - vision_config=self.vision_config, - image_token_index=self.image_token_index, - ) - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [ - self.batch_size, - self.max_num_images, - self.max_image_tiles, - self.vision_config["num_channels"], - self.vision_config["image_size"], - self.vision_config["image_size"], - ] - ) - aspect_ratio_ids = mindspore.tensor([[6] * self.batch_size]).swapaxes(0, 1) - aspect_ratio_mask = ops.ones(self.batch_size, self.max_num_images, self.max_image_tiles) - config = self.get_config() - - return config, pixel_values, aspect_ratio_ids, 
aspect_ratio_mask - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, aspect_ratio_ids, aspect_ratio_mask = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1) - aspect_ratio_mask = aspect_ratio_mask - cross_attention_mask = ops.ones( - (self.batch_size, self.seq_length, self.max_num_images, self.max_image_tiles) - ) - - input_ids[input_ids == config.image_token_index] = self.pad_token_id - input_ids[:, 1] = config.image_token_index - inputs_dict = { - "pixel_values": pixel_values, - "aspect_ratio_ids": aspect_ratio_ids, - "input_ids": input_ids, - "attention_mask": attention_mask, - "aspect_ratio_mask": aspect_ratio_mask, - "cross_attention_mask": cross_attention_mask, - "use_cache": True, - } - return config, inputs_dict - - def create_and_check_mllama_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask): - model = MllamaForConditionalGeneration(config=config) - model.eval() - # with torch.autocast(device_type="cuda", dtype=torch.float16): - logits = model( - input_ids=input_ids, - attention_mask=attention_mask, - pixel_values=pixel_values.to(mindspore.bfloat16), - return_dict=True, - )["logits"] - self.parent.assertFalse(ops.isnan(logits).any().item()) - - -@require_mindspore -class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - """ - Model tester for `MllamaForConditionalGeneration`. - """ - - all_model_classes = (MllamaForConditionalGeneration,) if is_mindspore_available() else () - all_generative_model_classes = (MllamaForConditionalGeneration,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - test_torchscript = False - - def setUp(self): - self.model_tester = MllamaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False) - - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(input_ids) - - with no_grad(): - model(**inputs) - - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - # while some other models require pixel_values to be present - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - inputs_embeds = model.get_input_embeddings()(input_ids) - - with no_grad(): - out_ids = model(input_ids=input_ids, **inputs)[0] - out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - # Mllama has cross attention layers and those 
have a different shape than normal attention layers - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - cross_attention_layers = self.model_tester.text_config["cross_attention_layers"] - for idx, iter_attentions in enumerate(attentions): - tgt_len = min_length + idx if not use_cache else 1 - src_len = min_length + idx - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - src_len, - ) - expected_shape_cross = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - self.model_tester.image_length, - ) - expected_shapes = [ - expected_shape if layer_idx not in cross_attention_layers else expected_shape_cross - for layer_idx in range(len(iter_attentions)) - ] - self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes) - - - @unittest.skip(reason="The test itself is broken") # TODO @zucchini-nlp - def test_generate_with_quant_cache(self): - pass - - @unittest.skip(reason="The test itself is broken") # TODO @zucchini-nlp - def test_beam_search_low_memory(self): - pass - - @unittest.skip(reason="AssertionError: Items in the second set but not the first: might be a setting issue") - def test_model_parallelism(self): - pass - - -@require_mindspore -class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase): - def setUp(self): - self.base_model_checkpoint = "meta-llama/Llama-3.2-11B-Vision" - self.instruct_model_checkpoint = "meta-llama/Llama-3.2-11B-Vision-Instruct" - - def tearDown(self): - gc.collect() - try: - mindspore.hal.empty_cache() - except: - pass - - # @slow - # @require_mindspore_gpu - # @require_bitsandbytes - # def test_11b_model_integration_generate(self): - # # Prepare inputs - # processor = AutoProcessor.from_pretrained(self.base_model_checkpoint) - - # prompt = "<|image|>If I had to write a haiku for this one" - # url = "https://llava-vl.github.io/static/images/view.jpg" - # image = Image.open(requests.get(url, stream=True).raw) - - # inputs = processor(text=prompt, images=image, return_tensors="ms") - - # # Check inputs ids - # expected_input_ids = torch.tensor([[128256, 128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342, 369, 420, 832]]) # fmt: skip - # self.assertTrue(torch.equal(inputs["input_ids"], expected_input_ids)) - - # # Load model in 4 bit - # quantization_config = BitsAndBytesConfig(load_in_4bit=True) - # model = MllamaForConditionalGeneration.from_pretrained( - # self.base_model_checkpoint, quantization_config=quantization_config - # ) - - # # Generate - # output = model.generate(**inputs, do_sample=False, max_new_tokens=25) - - # decoded_output = processor.decode(output[0], skip_special_tokens=True) - # expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a" # fmt: skip - - # self.assertEqual( - # decoded_output, - # expected_output, - # f"Decoded output: {decoded_output}\nExpected output: {expected_output}", - # ) - - # @slow - # @require_mindspore_gpu - # @require_bitsandbytes - # @require_read_token - # def test_11b_model_integration_generate_text_only(self): - # # Prepare inputs - # processor = AutoProcessor.from_pretrained(self.base_model_checkpoint) - # prompt = "If I had to write a haiku" - # inputs = processor(text=prompt, return_tensors="ms") - - # # Check inputs ids - 
# expected_input_ids = [128000, 2746, 358, 1047, 311, 3350, 264, 6520, 39342] - # self.assertEqual(inputs["input_ids"].cpu().squeeze().tolist(), expected_input_ids) - - # # Load model in 4 bit - # quantization_config = BitsAndBytesConfig(load_in_4bit=True) - # model = MllamaForConditionalGeneration.from_pretrained( - # self.base_model_checkpoint, quantization_config=quantization_config - # ) - - # # Generate - # output = model.generate(**inputs, do_sample=False, max_new_tokens=25) - - # decoded_output = processor.decode(output[0], skip_special_tokens=True) - # expected_output = "If I had to write a haiku about my life, I think it would be something like:\n\"Life is a messy stream\nTwists and turns, ups" # fmt: skip - - # self.assertEqual( - # decoded_output, - # expected_output, - # f"Decoded output: {decoded_output}\nExpected output: {expected_output}", - # ) - - # @slow - # @require_mindspore_gpu - # @require_bitsandbytes - # @require_read_token - # def test_11b_model_integration_forward(self): - # # Prepare inputs - # processor = AutoProcessor.from_pretrained(self.base_model_checkpoint) - - # prompt = "<|image|>If I had to write a haiku for this one" - # url = "https://llava-vl.github.io/static/images/view.jpg" - # image = Image.open(requests.get(url, stream=True).raw) - - # inputs = processor(text=prompt, images=image, return_tensors="ms") - - # # Load model in 4 bit - # quantization_config = BitsAndBytesConfig(load_in_4bit=True) - # model = MllamaForConditionalGeneration.from_pretrained( - # self.base_model_checkpoint, quantization_config=quantization_config - # ) - - # # Forward - # with torch.inference_mode(): - # output = model(**inputs) - - # actual_logits = output.logits[0, -1, :5].cpu() - # expected_logits = torch.tensor([8.3594, 7.7148, 4.7266, 0.7803, 3.1504]) - # self.assertTrue( - # torch.allclose(actual_logits, expected_logits, atol=0.1), - # f"Actual logits: {actual_logits}" - # f"\nExpected logits: {expected_logits}" - # f"\nDifference: {torch.abs(actual_logits - expected_logits)}", - # ) - - # @slow - # @require_mindspore_gpu - # @require_bitsandbytes - # @require_read_token - # def test_11b_model_integration_batched_generate(self): - # processor = AutoProcessor.from_pretrained(self.base_model_checkpoint) - - # # Prepare inputs - # prompt = [ - # "<|image|>If I had to write a haiku for this one", - # "<|image|>This image shows", - # ] - # image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) - # image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw) - - # inputs = processor(text=prompt, images=[[image1], [image2]], padding=True, return_tensors="ms").to( - # torch_device - # ) - - # # Load model in 4 bit - # quantization_config = BitsAndBytesConfig(load_in_4bit=True) - # model = MllamaForConditionalGeneration.from_pretrained( - # self.base_model_checkpoint, quantization_config=quantization_config - # ) - - # output = model.generate(**inputs, do_sample=False, max_new_tokens=25) - - # # Check first output - # decoded_output = processor.decode(output[0], skip_special_tokens=True) - # expected_output = "If I had to write a haiku for this one, it would be:.\\nI'm not a poet.\\nBut I'm a photographer.\\nAnd I'm a" # fmt: skip - - # self.assertEqual( - # decoded_output, - # expected_output, - # f"Decoded output: {decoded_output}\nExpected output: {expected_output}", - # ) - - # # Check second output - # decoded_output = processor.decode(output[1], skip_special_tokens=True) - # 
expected_output = "This image shows is a photograph of a stop sign in front of a Chinese archway. The stop sign is red with white letters and is" # fmt: skip - - # self.assertEqual( - # decoded_output, - # expected_output, - # f"Decoded output: {decoded_output}\nExpected output: {expected_output}", - # ) - - # @slow - # @require_mindspore_gpu - # @require_bitsandbytes - # @require_read_token - # def test_11b_model_integration_multi_image_generate(self): - # processor = AutoProcessor.from_pretrained(self.instruct_model_checkpoint) - - # # Prepare inputs - # image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) - # image2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw) - - # conversation = [ - # { - # "role": "user", - # "content": [ - # {"type": "image"}, - # {"type": "text", "text": "What’s shown in this image?"}, - # ], - # }, - # { - # "role": "assistant", - # "content": [ - # {"type": "text", "text": "This image shows a long wooden dock extending out into a lake."} - # ], - # }, - # { - # "role": "user", - # "content": [ - # {"type": "image"}, - # {"type": "text", "text": "What about this one, what do you see here? Can you describe in detail?"}, - # ], - # }, - # ] - - # prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - # inputs = processor(text=prompt, images=[[image1, image2]], return_tensors="ms") - # prompt_len = inputs["input_ids"].shape[-1] - - # # Load model in 4 bit - # quantization_config = BitsAndBytesConfig(load_in_4bit=True) - # model = MllamaForConditionalGeneration.from_pretrained( - # self.instruct_model_checkpoint, quantization_config=quantization_config - # ) - - # output = model.generate(**inputs, do_sample=False, max_new_tokens=25) - - # # Check first output - # generated_output = output[0][prompt_len:] - # decoded_output = processor.decode(generated_output, skip_special_tokens=False) - - # # model should response about "stop sign", however it responses about "dock" - # # this happens only in quantized version, bfloat16 works fine - # expected_output = "This image shows a long wooden dock extending out into a lake. The dock is made of wooden planks and has a railing" - - # self.assertEqual( - # decoded_output, - # expected_output, - # f"Decoded output: {decoded_output}\nExpected output: {expected_output}", - # ) \ No newline at end of file diff --git a/tests/transformers/models/mllama/test_processor_mllama.py b/tests/transformers/models/mllama/test_processor_mllama.py deleted file mode 100644 index 1419a465f..000000000 --- a/tests/transformers/models/mllama/test_processor_mllama.py +++ /dev/null @@ -1,179 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -from mindnlp.transformers import MllamaProcessor -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_vision_available - - -if is_vision_available(): - from PIL import Image - - -@require_mindspore -@require_vision -class MllamaProcessorTest(unittest.TestCase): - def setUp(self): - self.checkpoint = "hf-internal-testing/mllama-11b" # TODO: change - self.processor = MllamaProcessor.from_pretrained(self.checkpoint) - self.image1 = Image.new("RGB", (224, 220)) - self.image2 = Image.new("RGB", (512, 128)) - self.image_token = self.processor.image_token - self.image_token_id = self.processor.image_token_id - self.pad_token_id = self.processor.tokenizer.pad_token_id - self.bos_token = self.processor.bos_token - self.bos_token_id = self.processor.tokenizer.bos_token_id - - def test_apply_chat_template(self): - # Message contains content which a mix of lists with images and image urls and string - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "image"}, - {"type": "text", "text": "What do these images show?"}, - ], - }, - { - "role": "assistant", - "content": [ - {"type": "text", "text": "The first image shows the statue of Liberty in New York."}, - ], - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "And who is that?"}, - ], - }, - ] - - rendered = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - - expected_rendered = ( - "<|begin_of_text|>" - "<|start_header_id|>user<|end_header_id|>\n\n" - "<|image|><|image|>What do these images show?" - "<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - "The first image shows the statue of Liberty in New York." - "<|eot_id|>" - "<|start_header_id|>user<|end_header_id|>\n\n" - "And who is that?" - "<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - ) - self.assertEqual(rendered, expected_rendered) - - messages = [ - { - "role": "system", - "content": [ - {"type": "text", "text": "This is a test sentence."}, - ], - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "This is a response."}, - ], - }, - ] - input_ids = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - expected_ids = [ - 128000, # <|begin_of_text|> - 128006, # <|start_header_id|> - 9125, # "system" - 128007, # <|end_of_header|> - 271, # "\n\n" - 2028, - 374, - 264, - 1296, - 11914, - 13, # "This is a test sentence." 
- 128009, # <|eot_id|> - 128006, # <|start_header_id|> - 882, # "user" - 128007, # <|end_of_header|> - 271, # "\n\n" - 2028, - 374, - 264, - 2077, - 13, # "This is a response.", - 128009, # <|eot_id|> - 128006, # <|start_header_id|> - 78191, # "assistant" - 128007, # <|end_of_header|> - 271, # "\n\n" - ] - - self.assertEqual(input_ids, expected_ids) - - # test image in multiple locations - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this image in two sentences"}, - {"type": "image"}, - {"type": "text", "text": " Test sentence "}, - {"type": "image"}, - {"type": "text", "text": "ok\n"}, - ], - } - ] - - rendered = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - expected_rendered = ( - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" - "Describe this image in two sentences<|image|> Test sentence <|image|>ok\n<|eot_id|>" - "<|start_header_id|>assistant<|end_header_id|>\n\n" - ) - self.assertEqual(rendered, expected_rendered) - - input_ids = self.processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) - # fmt: off - expected_ids = [ - 128000, 128006, 882, 128007, 271, 75885, 420, 2217, 304, 1403, 23719, 128256, - 3475, 11914, 262, 128256, 564, 198, 128009, 128006, 78191, 128007, 271, - ] - # fmt: on - self.assertEqual(input_ids, expected_ids) - - # text format for content - messages_list = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "Describe this image in two sentences"}, - ], - } - ] - messages_str = [ - { - "role": "user", - "content": "<|image|>Describe this image in two sentences", - } - ] - - rendered_list = self.processor.apply_chat_template(messages_list, add_generation_prompt=True, tokenize=False) - rendered_str = self.processor.apply_chat_template(messages_str, add_generation_prompt=True, tokenize=False) - self.assertEqual(rendered_list, rendered_str) \ No newline at end of file diff --git a/tests/transformers/models/mluke/__init__.py b/tests/transformers/models/mluke/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mluke/test_tokenization_mluke.py b/tests/transformers/models/mluke/test_tokenization_mluke.py deleted file mode 100644 index 4e862f59b..000000000 --- a/tests/transformers/models/mluke/test_tokenization_mluke.py +++ /dev/null @@ -1,676 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest -from typing import Tuple - -from mindnlp.transformers.models.mluke.tokenization_mluke import MLukeTokenizer -from mindnlp.utils.testing_utils import get_tests_dir, require_mindspore, slow - -from ...test_tokenization_common import TokenizerTesterMixin - - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json") - - -class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "studio-ousia/mluke-base" - tokenizer_class = MLukeTokenizer - test_rust_tokenizer = False - from_pretrained_kwargs = {"cls_token": "<s>"} - - def setUp(self): - super().setUp() - - self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"} - - def get_tokenizer(self, task=None, **kwargs): - kwargs.update(self.special_tokens_map) - kwargs.update({"task": task}) - tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs) - return tokenizer - - def get_input_output_texts(self, tokenizer): - input_text = "lower newer" - output_text = "lower newer" - return input_text, output_text - - def test_full_tokenizer(self): - tokenizer = self.get_tokenizer() - text = "lower newer" - spm_tokens = ["▁l", "ow", "er", "▁new", "er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, spm_tokens) - - input_tokens = tokens + [tokenizer.unk_token] - input_spm_tokens = [149, 116, 40, 410, 40] + [3] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_spm_tokens) - - def mluke_dict_integration_testing(self): - tokenizer = self.get_tokenizer() - - self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [35378, 8999, 38]) - self.assertListEqual( - tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False), - [35378, 8999, 38, 33273, 11676, 604, 365, 21392, 201, 1819], - ) - - def test_sequence_builders(self): - tokenizer = self.tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-mluke") - - text = tokenizer.encode("sequence builders", add_special_tokens=False) - text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) - - encoded_text_from_decode = tokenizer.encode( - "sequence builders", add_special_tokens=True, add_prefix_space=False - ) - encoded_pair_from_decode = tokenizer.encode( - "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False - ) - - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - self.assertEqual(encoded_sentence, encoded_text_from_decode) - self.assertEqual(encoded_pair, encoded_pair_from_decode) - - def get_clean_sequence(self, tokenizer, max_length=20) -> Tuple[str, list]: - txt = "Beyonce lives in Los Angeles" - ids = tokenizer.encode(txt, add_special_tokens=False) - return txt, ids - - def test_pretokenized_inputs(self): - pass - - def test_embeded_special_tokens(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - sentence = "A, <mask> AllenNLP sentence."
- tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) - tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) - - # token_type_ids should put 0 everywhere - self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) - - # attention_mask should put 1 everywhere, so sum over length should be 1 - self.assertEqual( - sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), - sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), - ) - - tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - - # Rust correctly handles the space before the mask while python doesn't - self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) - - self.assertSequenceEqual( - tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] - ) - - def test_padding_entity_inputs(self): - tokenizer = self.get_tokenizer() - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." - span = (15, 34) - pad_id = tokenizer.entity_vocab["[PAD]"] - mask_id = tokenizer.entity_vocab["[MASK]"] - - encoding = tokenizer([sentence, sentence], entity_spans=[[span], [span, span]], padding=True) - self.assertEqual(encoding["entity_ids"], [[mask_id, pad_id], [mask_id, mask_id]]) - - # test with a sentence with no entity - encoding = tokenizer([sentence, sentence], entity_spans=[[], [span, span]], padding=True) - self.assertEqual(encoding["entity_ids"], [[pad_id, pad_id], [mask_id, mask_id]]) - - def test_if_tokenize_single_text_raise_error_with_invalid_inputs(self): - tokenizer = self.get_tokenizer() - - sentence = "ISO 639-3 uses the code fas for the dialects spoken across Iran and Afghanistan." - entities = ["DUMMY"] - spans = [(0, 9)] - - with self.assertRaises(ValueError): - tokenizer(sentence, entities=tuple(entities), entity_spans=spans) - - with self.assertRaises(ValueError): - tokenizer(sentence, entities=entities, entity_spans=tuple(spans)) - - with self.assertRaises(ValueError): - tokenizer(sentence, entities=[0], entity_spans=spans) - - with self.assertRaises(ValueError): - tokenizer(sentence, entities=entities, entity_spans=[0]) - - with self.assertRaises(ValueError): - tokenizer(sentence, entities=entities, entity_spans=spans + [(0, 9)]) - - def test_if_tokenize_entity_classification_raise_error_with_invalid_inputs(self): - tokenizer = self.get_tokenizer(task="entity_classification") - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." - span = (15, 34) - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[]) - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[span, span]) - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[0]) - - def test_if_tokenize_entity_pair_classification_raise_error_with_invalid_inputs(self): - tokenizer = self.get_tokenizer(task="entity_pair_classification") - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan."
- # head and tail information - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[]) - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[0, 0]) - - def test_if_tokenize_entity_span_classification_raise_error_with_invalid_inputs(self): - tokenizer = self.get_tokenizer(task="entity_span_classification") - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[]) - - with self.assertRaises(ValueError): - tokenizer(sentence, entity_spans=[0, 0, 0]) - - -@slow -# @require_torch -class MLukeTokenizerIntegrationTests(unittest.TestCase): - tokenizer_class = MLukeTokenizer - from_pretrained_kwargs = {"cls_token": ""} - - @classmethod - def setUpClass(cls): - cls.tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base", return_token_type_ids=True) - cls.entity_classification_tokenizer = MLukeTokenizer.from_pretrained( - "studio-ousia/mluke-base", return_token_type_ids=True, task="entity_classification" - ) - cls.entity_pair_tokenizer = MLukeTokenizer.from_pretrained( - "studio-ousia/mluke-base", return_token_type_ids=True, task="entity_pair_classification" - ) - - cls.entity_span_tokenizer = MLukeTokenizer.from_pretrained( - "studio-ousia/mluke-base", return_token_type_ids=True, task="entity_span_classification" - ) - - def test_single_text_no_padding_or_truncation(self): - tokenizer = self.tokenizer - sentence = "ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン (Afghanistan)." - entities = ["en:ISO 639-3", "DUMMY_ENTITY", "ja:アフガニスタン", "en:Afghanistan"] - spans = [(0, 9), (59, 63), (68, 75), (77, 88)] - - encoding = tokenizer(sentence, entities=entities, entity_spans=spans, return_token_type_ids=True) - - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - " ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン ( Afghanistan ).", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][1:5], spaces_between_special_tokens=False), "ISO 639-3" - ) - self.assertEqual(tokenizer.decode(encoding["input_ids"][17], spaces_between_special_tokens=False), "Iran") - self.assertEqual( - tokenizer.decode(encoding["input_ids"][19:25], spaces_between_special_tokens=False), "アフガニスタン" - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][26], spaces_between_special_tokens=False), "Afghanistan" - ) - - self.assertEqual( - encoding["entity_ids"], - [ - tokenizer.entity_vocab["en:ISO 639-3"], - tokenizer.entity_vocab["[UNK]"], - tokenizer.entity_vocab["ja:アフガニスタン"], - tokenizer.entity_vocab["en:Afghanistan"], - ], - ) - self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1, 1]) - self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0, 0]) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [ - [1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [17, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [19, 20, 21, 22, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - ] - ) - # fmt: on - - def test_single_text_only_entity_spans_no_padding_or_truncation(self): - tokenizer = 
self.tokenizer - - sentence = "ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン (Afghanistan)." - entities = ["en:ISO 639-3", "DUMMY_ENTITY", "ja:アフガニスタン", "en:Afghanistan"] - spans = [(0, 9), (59, 63), (68, 75), (77, 88)] - - encoding = tokenizer(sentence, entities=entities, entity_spans=spans, return_token_type_ids=True) - - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - " ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン ( Afghanistan ).", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][1:5], spaces_between_special_tokens=False), "ISO 639-3" - ) - self.assertEqual(tokenizer.decode(encoding["input_ids"][17], spaces_between_special_tokens=False), "Iran") - self.assertEqual( - tokenizer.decode(encoding["input_ids"][20:25], spaces_between_special_tokens=False), "アフガニスタン" - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][26], spaces_between_special_tokens=False), "Afghanistan" - ) - - self.assertEqual( - encoding["entity_ids"], - [ - tokenizer.entity_vocab["en:ISO 639-3"], - tokenizer.entity_vocab["[UNK]"], - tokenizer.entity_vocab["ja:アフガニスタン"], - tokenizer.entity_vocab["en:Afghanistan"], - ], - ) - self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1, 1]) - self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0, 0]) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [ - [1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [17, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [19, 20, 21, 22, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - ] - ) - # fmt: on - - def test_single_text_padding_pytorch_tensors(self): - tokenizer = self.tokenizer - - sentence = "ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン (Afghanistan)." - entities = ["en:ISO 639-3", "DUMMY_ENTITY", "ja:アフガニスタン", "en:Afghanistan"] - spans = [(0, 9), (59, 63), (68, 75), (77, 88)] - - encoding = tokenizer( - sentence, - entities=entities, - entity_spans=spans, - return_token_type_ids=True, - padding="max_length", - max_length=30, - max_entity_length=16, - return_tensors="np", - ) - - # test words - self.assertEqual(encoding["input_ids"].shape, (1, 30)) - self.assertEqual(encoding["attention_mask"].shape, (1, 30)) - self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) - - # test entities - self.assertEqual(encoding["entity_ids"].shape, (1, 16)) - self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16)) - self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16)) - self.assertEqual(encoding["entity_position_ids"].shape, (1, 16, tokenizer.max_mention_length)) - - def test_text_pair_no_padding_or_truncation(self): - tokenizer = self.tokenizer - - sentence = "ISO 639-3 uses the code fas" - sentence_pair = "for the dialects spoken across Iran and アフガニスタン (Afghanistan)." 
- entities = ["en:ISO 639-3"] - entities_pair = ["DUMMY_ENTITY", "ja:アフガニスタン", "en:Afghanistan"] - spans = [(0, 9)] - spans_pair = [(31, 35), (40, 47), (49, 60)] - - encoding = tokenizer( - sentence, - sentence_pair, - entities=entities, - entities_pair=entities_pair, - entity_spans=spans, - entity_spans_pair=spans_pair, - return_token_type_ids=True, - ) - - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - " ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン ( Afghanistan" - " ).", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][1:5], spaces_between_special_tokens=False), "ISO 639-3" - ) - self.assertEqual(tokenizer.decode(encoding["input_ids"][19], spaces_between_special_tokens=False), "Iran") - self.assertEqual( - tokenizer.decode(encoding["input_ids"][21:27], spaces_between_special_tokens=False), "アフガニスタン" - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][28], spaces_between_special_tokens=False), "Afghanistan" - ) - - self.assertEqual( - encoding["entity_ids"], - [ - tokenizer.entity_vocab["en:ISO 639-3"], - tokenizer.entity_vocab["[UNK]"], - tokenizer.entity_vocab["ja:アフガニスタン"], - tokenizer.entity_vocab["en:Afghanistan"], - ], - ) - self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1, 1]) - self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0, 0]) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [ - [1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [21, 22, 23, 24, 25, 26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [28, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - ] - ) - # fmt: on - - def test_text_pair_only_entity_spans_no_padding_or_truncation(self): - tokenizer = self.tokenizer - - sentence = "ISO 639-3 uses the code fas" - sentence_pair = "for the dialects spoken across Iran and アフガニスタン (Afghanistan)." 
- entities = ["en:ISO 639-3"] - entities_pair = ["DUMMY_ENTITY", "ja:アフガニスタン", "en:Afghanistan"] - spans = [(0, 9)] - spans_pair = [(31, 35), (40, 47), (49, 60)] - - encoding = tokenizer( - sentence, - sentence_pair, - entities=entities, - entities_pair=entities_pair, - entity_spans=spans, - entity_spans_pair=spans_pair, - return_token_type_ids=True, - ) - - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - " ISO 639-3 uses the code fas for the dialects spoken across Iran and アフガニスタン ( Afghanistan" - " ).", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][1:5], spaces_between_special_tokens=False), "ISO 639-3" - ) - self.assertEqual(tokenizer.decode(encoding["input_ids"][19], spaces_between_special_tokens=False), "Iran") - self.assertEqual( - tokenizer.decode(encoding["input_ids"][21:27], spaces_between_special_tokens=False), "アフガニスタン" - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][28], spaces_between_special_tokens=False), "Afghanistan" - ) - - self.assertEqual( - encoding["entity_ids"], - [ - tokenizer.entity_vocab["en:ISO 639-3"], - tokenizer.entity_vocab["[UNK]"], - tokenizer.entity_vocab["ja:アフガニスタン"], - tokenizer.entity_vocab["en:Afghanistan"], - ], - ) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [ - [1, 2, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [21, 22, 23, 24, 25, 26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [28, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - ] - ) - # fmt: on - - def test_text_pair_padding_pytorch_tensors(self): - tokenizer = self.tokenizer - - sentence = "ISO 639-3 uses the code fas" - sentence_pair = "for the dialects spoken across Iran and アフガニスタン (Afghanistan)." - entities = ["en:ISO 639-3"] - entities_pair = ["DUMMY_ENTITY", "ja:アフガニスタン", "en:Afghanistan"] - spans = [(0, 9)] - spans_pair = [(31, 35), (40, 47), (49, 60)] - - encoding = tokenizer( - sentence, - sentence_pair, - entities=entities, - entities_pair=entities_pair, - entity_spans=spans, - entity_spans_pair=spans_pair, - return_token_type_ids=True, - padding="max_length", - max_length=40, - max_entity_length=16, - return_tensors="np", - ) - - # test words - self.assertEqual(encoding["input_ids"].shape, (1, 40)) - self.assertEqual(encoding["attention_mask"].shape, (1, 40)) - self.assertEqual(encoding["token_type_ids"].shape, (1, 40)) - - # test entities - self.assertEqual(encoding["entity_ids"].shape, (1, 16)) - self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16)) - self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16)) - self.assertEqual(encoding["entity_position_ids"].shape, (1, 16, tokenizer.max_mention_length)) - - def test_entity_classification_no_padding_or_truncation(self): - tokenizer = self.entity_classification_tokenizer - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." 
- span = (15, 34) - - encoding = tokenizer(sentence, entity_spans=[span], return_token_type_ids=True) - - # test words - self.assertEqual(len(encoding["input_ids"]), 23) - self.assertEqual(len(encoding["attention_mask"]), 23) - self.assertEqual(len(encoding["token_type_ids"]), 23) - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - "<s> Japanese is an<ent>East Asian language<ent>spoken by about 128 million people, primarily in" - " Japan.</s>", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][4:9], spaces_between_special_tokens=False), - "<ent>East Asian language<ent>", - ) - - # test entities - mask_id = tokenizer.entity_vocab["[MASK]"] - self.assertEqual(encoding["entity_ids"], [mask_id]) - self.assertEqual(encoding["entity_attention_mask"], [1]) - self.assertEqual(encoding["entity_token_type_ids"], [0]) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [[4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]] - ) - # fmt: on - - def test_entity_classification_padding_pytorch_tensors(self): - tokenizer = self.entity_classification_tokenizer - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." - span = (15, 34) - - encoding = tokenizer( - sentence, entity_spans=[span], return_token_type_ids=True, padding="max_length", return_tensors="np" - ) - - # test words - self.assertEqual(encoding["input_ids"].shape, (1, 512)) - self.assertEqual(encoding["attention_mask"].shape, (1, 512)) - self.assertEqual(encoding["token_type_ids"].shape, (1, 512)) - - # test entities - self.assertEqual(encoding["entity_ids"].shape, (1, 1)) - self.assertEqual(encoding["entity_attention_mask"].shape, (1, 1)) - self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 1)) - self.assertEqual( - encoding["entity_position_ids"].shape, (1, tokenizer.max_entity_length, tokenizer.max_mention_length) - ) - - def test_entity_pair_classification_no_padding_or_truncation(self): - tokenizer = self.entity_pair_tokenizer - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan."
- # head and tail information - spans = [(0, 8), (84, 89)] - - encoding = tokenizer(sentence, entity_spans=spans, return_token_type_ids=True) - - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - "<s><ent>Japanese<ent>is an East Asian language spoken by about 128 million people, primarily" - " in<ent2>Japan<ent2>.</s>", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][1:4], spaces_between_special_tokens=False), - "<ent>Japanese<ent>", - ) - self.assertEqual( - tokenizer.decode(encoding["input_ids"][20:23], spaces_between_special_tokens=False), "<ent2>Japan<ent2>" - ) - - mask_id = tokenizer.entity_vocab["[MASK]"] - mask2_id = tokenizer.entity_vocab["[MASK2]"] - self.assertEqual(encoding["entity_ids"], [mask_id, mask2_id]) - self.assertEqual(encoding["entity_attention_mask"], [1, 1]) - self.assertEqual(encoding["entity_token_type_ids"], [0, 0]) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [ - [1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [20, 21, 22, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] - ] - ) - # fmt: on - - def test_entity_pair_classification_padding_pytorch_tensors(self): - tokenizer = self.entity_pair_tokenizer - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." - # head and tail information - spans = [(0, 8), (84, 89)] - - encoding = tokenizer( - sentence, - entity_spans=spans, - return_token_type_ids=True, - padding="max_length", - max_length=30, - return_tensors="np", - ) - - # test words - self.assertEqual(encoding["input_ids"].shape, (1, 30)) - self.assertEqual(encoding["attention_mask"].shape, (1, 30)) - self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) - - # test entities - self.assertEqual(encoding["entity_ids"].shape, (1, 2)) - self.assertEqual(encoding["entity_attention_mask"].shape, (1, 2)) - self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 2)) - self.assertEqual( - encoding["entity_position_ids"].shape, (1, tokenizer.max_entity_length, tokenizer.max_mention_length) - ) - - def test_entity_span_classification_no_padding_or_truncation(self): - tokenizer = self.entity_span_tokenizer - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan."
- spans = [(0, 8), (15, 34), (84, 89)] - - encoding = tokenizer(sentence, entity_spans=spans, return_token_type_ids=True) - - self.assertEqual( - tokenizer.decode(encoding["input_ids"], spaces_between_special_tokens=False), - " Japanese is an East Asian language spoken by about 128 million people, primarily in Japan.", - ) - - mask_id = tokenizer.entity_vocab["[MASK]"] - self.assertEqual(encoding["entity_ids"], [mask_id, mask_id, mask_id]) - self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1]) - self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0]) - # fmt: off - self.assertEqual( - encoding["entity_position_ids"], - [ - [1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [4, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], - [18, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]] - ) - # fmt: on - self.assertEqual(encoding["entity_start_positions"], [1, 4, 18]) - self.assertEqual(encoding["entity_end_positions"], [1, 6, 18]) - - def test_entity_span_classification_padding_pytorch_tensors(self): - tokenizer = self.entity_span_tokenizer - - sentence = "Japanese is an East Asian language spoken by about 128 million people, primarily in Japan." - spans = [(0, 8), (15, 34), (84, 89)] - - encoding = tokenizer( - sentence, - entity_spans=spans, - return_token_type_ids=True, - padding="max_length", - max_length=30, - max_entity_length=16, - return_tensors="np", - ) - - # test words - self.assertEqual(encoding["input_ids"].shape, (1, 30)) - self.assertEqual(encoding["attention_mask"].shape, (1, 30)) - self.assertEqual(encoding["token_type_ids"].shape, (1, 30)) - - # test entities - self.assertEqual(encoding["entity_ids"].shape, (1, 16)) - self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16)) - self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16)) - self.assertEqual(encoding["entity_position_ids"].shape, (1, 16, tokenizer.max_mention_length)) - self.assertEqual(encoding["entity_start_positions"].shape, (1, 16)) - self.assertEqual(encoding["entity_end_positions"].shape, (1, 16)) diff --git a/tests/transformers/models/mobilebert/__init__.py b/tests/transformers/models/mobilebert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mobilenet_v1/__init__.py b/tests/transformers/models/mobilenet_v1/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mobilenet_v1/test_image_processing_mobilenet_v1.py b/tests/transformers/models/mobilenet_v1/test_image_processing_mobilenet_v1.py deleted file mode 100644 index 36a160f32..000000000 --- a/tests/transformers/models/mobilenet_v1/test_image_processing_mobilenet_v1.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import MobileNetV1ImageProcessor - - -class MobileNetV1ImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - ): - size = size if size is not None else {"shortest_edge": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class MobileNetV1ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = MobileNetV1ImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) diff --git a/tests/transformers/models/mobilenet_v1/test_modeling_mobilenet_v1.py b/tests/transformers/models/mobilenet_v1/test_modeling_mobilenet_v1.py deleted file mode 100644 index aed038577..000000000 --- a/tests/transformers/models/mobilenet_v1/test_modeling_mobilenet_v1.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore MobileNetV1 model.""" - -import unittest - -from mindnlp.transformers import MobileNetV1Config -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import MobileNetV1ForImageClassification, MobileNetV1Model - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MobileNetV1ImageProcessor - - -class MobileNetV1ConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "tf_padding")) - self.parent.assertTrue(hasattr(config, "depth_multiplier")) - - -class MobileNetV1ModelTester: - def __init__( - self, - parent, - batch_size=13, - num_channels=3, - image_size=32, - depth_multiplier=0.25, - min_depth=8, - tf_padding=True, - last_hidden_size=1024, - output_stride=32, - hidden_act="relu6", - classifier_dropout_prob=0.1, - initializer_range=0.02, - is_training=True, - use_labels=True, - num_labels=10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.depth_multiplier = depth_multiplier - self.min_depth = min_depth - self.tf_padding = tf_padding - self.last_hidden_size = int(last_hidden_size * depth_multiplier) - self.output_stride = output_stride - self.hidden_act = hidden_act - self.classifier_dropout_prob = classifier_dropout_prob - self.use_labels = use_labels - self.is_training = is_training - self.num_labels = num_labels - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - pixel_labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels, pixel_labels - - def get_config(self): - return MobileNetV1Config( - num_channels=self.num_channels, - image_size=self.image_size, - depth_multiplier=self.depth_multiplier, - min_depth=self.min_depth, - tf_padding=self.tf_padding, - hidden_act=self.hidden_act, - classifier_dropout_prob=self.classifier_dropout_prob, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values, labels, pixel_labels): - model = MobileNetV1Model(config=config) - model.eval() - result = 
model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, - ( - self.batch_size, - self.last_hidden_size, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileNetV1ForImageClassification(config) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels, pixel_labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class MobileNetV1ModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as MobileNetV1 does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (MobileNetV1Model, MobileNetV1ForImageClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": MobileNetV1Model, "image-classification": MobileNetV1ForImageClassification} - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = MobileNetV1ModelTester(self) - self.config_tester = MobileNetV1ConfigTester(self, config_class=MobileNetV1Config, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="MobileNetV1 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="MobileNetV1 does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="MobileNetV1 does not output attentions") - def test_attention_outputs(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_stages = 26 - self.assertEqual(len(hidden_states), expected_num_stages) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/mobilenet_v1_1.0_224" - model = MobileNetV1Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip - def test_batching_equivalence(self): - super().test_batching_equivalence() - - -# We will verify our results 
on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class MobileNetV1ModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - MobileNetV1ImageProcessor.from_pretrained("google/mobilenet_v1_1.0_224") if is_vision_available() else None - ) - - @slow - def test_inference_image_classification_head(self): - model = MobileNetV1ForImageClassification.from_pretrained("google/mobilenet_v1_1.0_224") - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1001) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-4.1739, -1.1233, 3.1205]) - print(outputs.logits[0, :3]) - self.assertTrue(ops.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) diff --git a/tests/transformers/models/mobilenet_v2/__init__.py b/tests/transformers/models/mobilenet_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mobilenet_v2/test_image_processing_mobilenet_v2.py b/tests/transformers/models/mobilenet_v2/test_image_processing_mobilenet_v2.py deleted file mode 100644 index 16009b64f..000000000 --- a/tests/transformers/models/mobilenet_v2/test_image_processing_mobilenet_v2.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ - - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import MobileNetV2ImageProcessor - - -class MobileNetV2ImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - ): - size = size if size is not None else {"shortest_edge": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class MobileNetV2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = MobileNetV2ImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "do_resize")) - self.assertTrue(hasattr(image_processor, "size")) - self.assertTrue(hasattr(image_processor, "do_center_crop")) - self.assertTrue(hasattr(image_processor, "crop_size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) \ No newline at end of file diff --git a/tests/transformers/models/mobilenet_v2/test_modeling_mobilenet_v2.py b/tests/transformers/models/mobilenet_v2/test_modeling_mobilenet_v2.py deleted file mode 100644 index fbbf42d1a..000000000 --- a/tests/transformers/models/mobilenet_v2/test_modeling_mobilenet_v2.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not 
use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================ -"""Testing suite for the Mindspore MobileNetV2 model.""" - -import unittest -import numpy as np - -from mindnlp.transformers import MobileNetV2Config -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - from mindspore import context - - from mindnlp.transformers import MobileNetV2ForImageClassification, MobileNetV2ForSemanticSegmentation, MobileNetV2Model - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MobileNetV2ImageProcessor - - -class MobileNetV2ConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "tf_padding")) - self.parent.assertTrue(hasattr(config, "depth_multiplier")) - - -class MobileNetV2ModelTester: - def __init__( - self, - parent, - batch_size=13, - num_channels=3, - image_size=32, - depth_multiplier=0.25, - depth_divisible_by=8, - min_depth=8, - expand_ratio=6, - output_stride=32, - first_layer_is_expansion=True, - finegrained_output=True, - tf_padding=True, - hidden_act="relu6", - last_hidden_size=1280, - classifier_dropout_prob=0.1, - initializer_range=0.02, - is_training=True, - use_labels=True, - num_labels=10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.depth_multiplier = depth_multiplier - self.depth_divisible_by = depth_divisible_by - self.min_depth = min_depth - self.expand_ratio = expand_ratio - self.tf_padding = tf_padding - self.output_stride = output_stride - self.first_layer_is_expansion = first_layer_is_expansion - self.finegrained_output = finegrained_output - self.hidden_act = hidden_act - self.last_hidden_size = last_hidden_size if finegrained_output else int(last_hidden_size * depth_multiplier) - self.classifier_dropout_prob = classifier_dropout_prob - self.use_labels = use_labels - self.is_training = is_training - self.num_labels = num_labels - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - pixel_labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels, pixel_labels - - def get_config(self): - return MobileNetV2Config( - num_channels=self.num_channels, - image_size=self.image_size, - 
depth_multiplier=self.depth_multiplier, - depth_divisible_by=self.depth_divisible_by, - min_depth=self.min_depth, - expand_ratio=self.expand_ratio, - output_stride=self.output_stride, - first_layer_is_expansion=self.first_layer_is_expansion, - finegrained_output=self.finegrained_output, - hidden_act=self.hidden_act, - tf_padding=self.tf_padding, - classifier_dropout_prob=self.classifier_dropout_prob, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values, labels, pixel_labels): - model = MobileNetV2Model(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, - ( - self.batch_size, - self.last_hidden_size, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - self.parent.assertEqual( - result.pooler_output.shape, - (self.batch_size, self.last_hidden_size), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileNetV2ForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileNetV2ForSemanticSegmentation(config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, - ( - self.batch_size, - self.num_labels, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - result = model(pixel_values, labels=pixel_labels) - self.parent.assertEqual( - result.logits.shape, - ( - self.batch_size, - self.num_labels, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels, pixel_labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class MobileNetV2ModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as MobileNetV2 does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - ( MobileNetV2ForImageClassification, MobileNetV2ForSemanticSegmentation) - if is_mindspore_available() - else () - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = MobileNetV2ModelTester(self) - self.config_tester = MobileNetV2ConfigTester(self, config_class=MobileNetV2Config, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="MobileNetV2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="MobileNetV2 does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="MobileNetV2 does not output attentions") - def test_attention_outputs(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_stages = 16 - self.assertEqual(len(hidden_states), expected_num_stages) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/mobilenet_v2_1.4_224" - model = MobileNetV2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip - def test_batching_equivalence(self): - super().test_batching_equivalence() - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class MobileNetV2ModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - MobileNetV2ImageProcessor.from_pretrained("google/mobilenet_v2_1.0_224") if is_vision_available() else None - ) - - @slow - def test_inference_image_classification_head(self): - model = MobileNetV2ForImageClassification.from_pretrained("google/mobilenet_v2_1.0_224") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1001) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = ms.tensor([0.2445, -1.1993, 0.1905]) - - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), 
atol=1e-4)) - - @slow - def test_inference_semantic_segmentation(self): - model = MobileNetV2ForSemanticSegmentation.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513") - - image_processor = MobileNetV2ImageProcessor.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 21, 65, 65) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = ms.tensor( - [ - [[17.5790, 17.7581, 18.3355], [18.3257, 18.4230, 18.8973], [18.6169, 18.8650, 19.2187]], - [[-2.1595, -2.0977, -2.3741], [-2.4226, -2.3028, -2.6835], [-2.7819, -2.5991, -2.7706]], - [[4.2058, 4.8317, 4.7638], [4.4136, 5.0361, 4.9383], [4.5028, 4.9644, 4.8734]], - ] - ) - - self.assertTrue(np.allclose(logits[0, :3, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/mobilevit/__init__.py b/tests/transformers/models/mobilevit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mobilevit/test_image_processing_mobilevit.py b/tests/transformers/models/mobilevit/test_image_processing_mobilevit.py deleted file mode 100644 index e9d8a34c0..000000000 --- a/tests/transformers/models/mobilevit/test_image_processing_mobilevit.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ -"""Testing suite for the Image Processing of MobileVit.""" - -import unittest - -from datasets import load_dataset - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MobileViTImageProcessor - - -class MobileViTImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_flip_channel_order=True, - ): - size = size if size is not None else {"shortest_edge": 20} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_center_crop = do_center_crop - self.crop_size = crop_size - self.do_flip_channel_order = do_flip_channel_order - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_flip_channel_order": self.do_flip_channel_order, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - -@slow -def prepare_semantic_single_inputs(): - dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") - - image = Image.open(dataset[0]["file"]) - map = Image.open(dataset[1]["file"]) - - return image, map - -@slow -def prepare_semantic_batch_inputs(): - dataset = load_dataset("hf-internal-testing/fixtures_ade20k", split="test") - - image1 = Image.open(dataset[0]["file"]) - map1 = Image.open(dataset[1]["file"]) - image2 = Image.open(dataset[2]["file"]) - map2 = Image.open(dataset[3]["file"]) - - return [image1, image2], [map1, map2] - - -@require_mindspore -@require_vision -class MobileViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = MobileViTImageProcessor - - def setUp(self): - self.image_processor_tester = MobileViTImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_flip_channel_order")) - - def 
test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 20}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - def test_call_segmentation_maps(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - maps = [] - for image in image_inputs: - self.assertIsInstance(image, ms.Tensor) - maps.append(ops.zeros(image.shape[-2:]).astype(ms.int64)) - - # Test not batched input - encoding = image_processing(image_inputs[0], maps[0], return_tensors="ms") - self.assertEqual( - encoding["pixel_values"].shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual( - encoding["labels"].shape, - ( - 1, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual(encoding["labels"].dtype, ms.int64) - self.assertTrue(encoding["labels"].min().item() >= 0) - self.assertTrue(encoding["labels"].max().item() <= 255) - - # Test batched - encoding = image_processing(image_inputs, maps, return_tensors="ms") - self.assertEqual( - encoding["pixel_values"].shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual( - encoding["labels"].shape, - ( - self.image_processor_tester.batch_size, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual(encoding["labels"].dtype, ms.int64) - self.assertTrue(encoding["labels"].min().item() >= 0) - self.assertTrue(encoding["labels"].max().item() <= 255) - - # Test not batched input (PIL images) - image, segmentation_map = prepare_semantic_single_inputs() - - encoding = image_processing(image, segmentation_map, return_tensors="ms") - self.assertEqual( - encoding["pixel_values"].shape, - ( - 1, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual( - encoding["labels"].shape, - ( - 1, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual(encoding["labels"].dtype, ms.int64) - self.assertTrue(encoding["labels"].min().item() >= 0) - self.assertTrue(encoding["labels"].max().item() <= 255) - - # Test batched input (PIL images) - images, segmentation_maps = prepare_semantic_batch_inputs() - - encoding = image_processing(images, segmentation_maps, return_tensors="ms") - self.assertEqual( - encoding["pixel_values"].shape, - ( - 2, - self.image_processor_tester.num_channels, - self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual( - encoding["labels"].shape, - ( - 2, - 
self.image_processor_tester.crop_size["height"], - self.image_processor_tester.crop_size["width"], - ), - ) - self.assertEqual(encoding["labels"].dtype, ms.int64) - self.assertTrue(encoding["labels"].min().item() >= 0) - self.assertTrue(encoding["labels"].max().item() <= 255) diff --git a/tests/transformers/models/mobilevit/test_modeling_mobilevit.py b/tests/transformers/models/mobilevit/test_modeling_mobilevit.py deleted file mode 100644 index c4b8e6310..000000000 --- a/tests/transformers/models/mobilevit/test_modeling_mobilevit.py +++ /dev/null @@ -1,361 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore MobileViT model.""" - -import unittest - -from mindnlp.transformers import MobileViTConfig -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import MobileViTForImageClassification, MobileViTForSemanticSegmentation, MobileViTModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MobileViTImageProcessor - - -class MobileViTConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "hidden_sizes")) - self.parent.assertTrue(hasattr(config, "neck_hidden_sizes")) - self.parent.assertTrue(hasattr(config, "num_attention_heads")) - - -class MobileViTModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - patch_size=2, - num_channels=3, - last_hidden_size=32, - num_attention_heads=4, - hidden_act="silu", - conv_kernel_size=3, - output_stride=32, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - classifier_dropout_prob=0.1, - initializer_range=0.02, - is_training=True, - use_labels=True, - num_labels=10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.last_hidden_size = last_hidden_size - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.conv_kernel_size = conv_kernel_size - self.output_stride = output_stride - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.classifier_dropout_prob = classifier_dropout_prob - self.use_labels = use_labels - self.is_training = is_training - self.num_labels = num_labels - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - 
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - pixel_labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels, pixel_labels - - def get_config(self): - return MobileViTConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - num_attention_heads=self.num_attention_heads, - hidden_act=self.hidden_act, - conv_kernel_size=self.conv_kernel_size, - output_stride=self.output_stride, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - classifier_dropout_prob=self.classifier_dropout_prob, - initializer_range=self.initializer_range, - hidden_sizes=[12, 16, 20], - neck_hidden_sizes=[8, 8, 16, 16, 32, 32, 32], - ) - - def create_and_check_model(self, config, pixel_values, labels, pixel_labels): - model = MobileViTModel(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, - ( - self.batch_size, - self.last_hidden_size, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileViTForImageClassification(config) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileViTForSemanticSegmentation(config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, - ( - self.batch_size, - self.num_labels, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - result = model(pixel_values, labels=pixel_labels) - self.parent.assertEqual( - result.logits.shape, - ( - self.batch_size, - self.num_labels, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels, pixel_labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class MobileViTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as MobileViT does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - (MobileViTModel, MobileViTForImageClassification, MobileViTForSemanticSegmentation) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": MobileViTModel, - "image-classification": MobileViTForImageClassification, - "image-segmentation": MobileViTForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = MobileViTModelTester(self) - self.config_tester = MobileViTConfigTester(self, config_class=MobileViTConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="MobileViT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="MobileViT does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="MobileViT does not output attentions") - def test_attention_outputs(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_stages = 5 - self.assertEqual(len(hidden_states), expected_num_stages) - - # MobileViT's feature maps are of shape (batch_size, num_channels, height, width) - # with the width and height being successively divided by 2. 
- divisor = 2 - for i in range(len(hidden_states)): - self.assertListEqual( - list(hidden_states[i].shape[-2:]), - [self.model_tester.image_size // divisor, self.model_tester.image_size // divisor], - ) - divisor *= 2 - - self.assertEqual(self.model_tester.output_stride, divisor // 2) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "apple/mobilevit-small" - model = MobileViTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516") - def test_batching_equivalence(self): - super().test_batching_equivalence() - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class MobileViTModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return MobileViTImageProcessor.from_pretrained("apple/mobilevit-xx-small") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = MobileViTForImageClassification.from_pretrained("apple/mobilevit-xx-small") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-1.9364, -1.2327, -0.4653]) - - self.assertTrue(ops.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_semantic_segmentation(self): - model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small") - - image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-xx-small") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 21, 32, 32) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [[6.9713, 6.9786, 7.2422], [7.2893, 7.2825, 7.4446], [7.6580, 7.8797, 7.9420]], - [[-10.6869, -10.3250, -10.3471], [-10.4228, -9.9868, -9.7132], [-11.0405, -11.0221, -10.7318]], - [[-3.3089, -2.8539, -2.6740], [-3.2706, -2.5621, -2.5108], [-3.2534, -2.6615, -2.6651]], - ], - ) - - self.assertTrue(ops.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_post_processing_semantic_segmentation(self): - model = 
MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-xx-small") - - image_processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-xx-small") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - outputs.logits = outputs.logits.detach().cpu() - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(50, 60)]) - expected_shape = (50, 60) - self.assertEqual(segmentation[0].shape, expected_shape) - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) - expected_shape = (32, 32) - self.assertEqual(segmentation[0].shape, expected_shape) \ No newline at end of file diff --git a/tests/transformers/models/mobilevitv2/__init__.py b/tests/transformers/models/mobilevitv2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mobilevitv2/test_modeling_mobilevitv2.py b/tests/transformers/models/mobilevitv2/test_modeling_mobilevitv2.py deleted file mode 100644 index d1e53171c..000000000 --- a/tests/transformers/models/mobilevitv2/test_modeling_mobilevitv2.py +++ /dev/null @@ -1,367 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch MobileViTV2 model.""" - -import unittest -from mindnlp.transformers import MobileViTV2Config -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - -if is_mindspore_available(): - import mindspore - - from mindnlp.core import ops, no_grad - from mindnlp.transformers import MobileViTV2ForImageClassification, MobileViTV2ForSemanticSegmentation, MobileViTV2Model - from mindnlp.transformers.models.mobilevitv2.modeling_mobilevitv2 import ( - make_divisible, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import MobileViTImageProcessor - - -class MobileViTV2ConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "width_multiplier")) - - -class MobileViTV2ModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=64, - patch_size=2, - num_channels=3, - hidden_act="swish", - conv_kernel_size=3, - output_stride=32, - classifier_dropout_prob=0.1, - initializer_range=0.02, - is_training=True, - use_labels=True, - num_labels=10, - scope=None, - width_multiplier=0.25, - ffn_dropout=0.0, - attn_dropout=0.0, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.last_hidden_size = make_divisible(512 * width_multiplier, divisor=8) - self.hidden_act = hidden_act - self.conv_kernel_size = conv_kernel_size - self.output_stride = output_stride - self.classifier_dropout_prob = classifier_dropout_prob - self.use_labels = use_labels - self.is_training = is_training - self.num_labels = num_labels - self.initializer_range = initializer_range - self.scope = scope - self.width_multiplier = width_multiplier - self.ffn_dropout_prob = ffn_dropout - self.attn_dropout_prob = attn_dropout - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - pixel_labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels, pixel_labels - - def get_config(self): - return MobileViTV2Config( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_act=self.hidden_act, - conv_kernel_size=self.conv_kernel_size, - output_stride=self.output_stride, - classifier_dropout_prob=self.classifier_dropout_prob, - initializer_range=self.initializer_range, - width_multiplier=self.width_multiplier, - ffn_dropout=self.ffn_dropout_prob, - attn_dropout=self.attn_dropout_prob, - base_attn_unit_dims=[16, 24, 32], - n_attn_blocks=[1, 1, 2], - aspp_out_channels=32, - ) - - def create_and_check_model(self, config, pixel_values, labels, pixel_labels): - model = MobileViTV2Model(config=config) - - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, - ( - self.batch_size, - self.last_hidden_size, - self.image_size // self.output_stride, - 
self.image_size // self.output_stride, - ), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileViTV2ForImageClassification(config) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels, pixel_labels): - config.num_labels = self.num_labels - model = MobileViTV2ForSemanticSegmentation(config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, - ( - self.batch_size, - self.num_labels, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - result = model(pixel_values, labels=pixel_labels) - self.parent.assertEqual( - result.logits.shape, - ( - self.batch_size, - self.num_labels, - self.image_size // self.output_stride, - self.image_size // self.output_stride, - ), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels, pixel_labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class MobileViTV2ModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as MobileViTV2 does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - (MobileViTV2Model, MobileViTV2ForImageClassification, MobileViTV2ForSemanticSegmentation) - if is_mindspore_available() - else () - ) - - pipeline_model_mapping = ( - { - "image-feature-extraction": MobileViTV2Model, - "image-classification": MobileViTV2ForImageClassification, - "image-segmentation": MobileViTV2ForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = MobileViTV2ModelTester(self) - self.config_tester = MobileViTV2ConfigTester(self, config_class=MobileViTV2Config, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="MobileViTV2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="MobileViTV2 does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="MobileViTV2 does not output attentions") - def test_attention_outputs(self): - pass - - # @require_torch_multi_gpu - @unittest.skip(reason="Got `CUDA error: misaligned address` for tests after this one being run.") - def test_multi_gpu_data_parallel_forward(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_stages = 5 - self.assertEqual(len(hidden_states), expected_num_stages) - - # MobileViTV2's feature maps are of shape (batch_size, num_channels, height, width) - # with the width and height being successively divided by 2. 
- divisor = 2 - for i in range(len(hidden_states)): - self.assertListEqual( - list(hidden_states[i].shape[-2:]), - [self.model_tester.image_size // divisor, self.model_tester.image_size // divisor], - ) - divisor *= 2 - - self.assertEqual(self.model_tester.output_stride, divisor // 2) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "apple/mobilevitv2-1.0-imagenet1k-256" - model = MobileViTV2Model.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class MobileViTV2ModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - MobileViTImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256") - if is_vision_available() - else None - ) - - @slow - def test_inference_image_classification_head(self): - model = MobileViTV2ForImageClassification.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-1.6336e00, -7.3204e-02, -5.1883e-01]) - - self.assertTrue(ops.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_semantic_segmentation(self): - model = MobileViTV2ForSemanticSegmentation.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") - - image_processor = MobileViTImageProcessor.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 21, 32, 32) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [[7.0863, 7.1525, 6.8201], [6.6931, 6.8770, 6.8933], [6.2978, 7.0366, 6.9636]], - [[-3.7134, -3.6712, -3.6675], [-3.5825, -3.3549, -3.4777], [-3.3435, -3.3979, -3.2857]], - [[-2.9329, -2.8003, -2.7369], [-3.0564, -2.4780, -2.0207], [-2.6889, -1.9298, -1.7640]], - ], - ) - - self.assertTrue(ops.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-3)) - - @slow - def test_post_processing_semantic_segmentation(self): - model = MobileViTV2ForSemanticSegmentation.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") - - 
image_processor = MobileViTImageProcessor.from_pretrained("shehan97/mobilevitv2-1.0-voc-deeplabv3") - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - outputs.logits = mindspore.ops.stop_gradient(outputs.logits) - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(50, 60)]) - expected_shape = (50, 60) - self.assertEqual(segmentation[0].shape, expected_shape) - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) - expected_shape = (32, 32) - self.assertEqual(segmentation[0].shape, expected_shape) diff --git a/tests/transformers/models/mpnet/__init__.py b/tests/transformers/models/mpnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mpnet/test_modeling_mpnet.py b/tests/transformers/models/mpnet/test_modeling_mpnet.py deleted file mode 100644 index d8768cd08..000000000 --- a/tests/transformers/models/mpnet/test_modeling_mpnet.py +++ /dev/null @@ -1,256 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuaWei Technologies Co., Microsoft Corporation. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import unittest -import numpy as np - -from mindnlp.transformers import MPNetConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - MPNetForMaskedLM, - MPNetForMultipleChoice, - MPNetForQuestionAnswering, - MPNetForSequenceClassification, - MPNetForTokenClassification, - MPNetModel, - ) - - -class MPNetModelTester: - """You can also import this e.g from .test_modeling_mpnet import MPNetModelTester""" - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=64, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - 
self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def get_large_model_config(self): - return MPNetConfig.from_pretrained("microsoft/mpnet-base") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return MPNetConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_mpnet_model( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MPNetModel(config=config) - model.set_train(False) - result = model(input_ids, input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_mpnet_for_question_answering( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = MPNetForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_mpnet_for_sequence_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = MPNetForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_mpnet_for_multiple_choice( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = MPNetForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - labels=choice_labels, - ) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_mpnet_for_token_classification( - self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = MPNetForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class MPNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - MPNetForMaskedLM, - MPNetForMultipleChoice, - MPNetForQuestionAnswering, - MPNetForSequenceClassification, - MPNetForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": MPNetModel, - "fill-mask": MPNetForMaskedLM, - "question-answering": MPNetForQuestionAnswering, - "text-classification": MPNetForSequenceClassification, - "token-classification": MPNetForTokenClassification, - "zero-shot": MPNetForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_resize_embeddings = True - - def setUp(self): - self.model_tester = MPNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_mpnet_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpnet_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpnet_for_sequence_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpnet_for_multiple_choice(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpnet_for_token_classification(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs) - - -@require_mindspore -class MPNetModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = MPNetModel.from_pretrained("microsoft/mpnet-base") - input_ids = mindspore.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = ((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[-0.0550, 0.1943, -0.0740], [-0.0562, 0.2211, -0.0579], [-0.0437, 0.3337, -0.0641]]] - ) - # compare the actual values for a slice. 
- self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/mpt/__init__.py b/tests/transformers/models/mpt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mpt/test_modeling_mpt.py b/tests/transformers/models/mpt/test_modeling_mpt.py deleted file mode 100644 index 53637bdbb..000000000 --- a/tests/transformers/models/mpt/test_modeling_mpt.py +++ /dev/null @@ -1,503 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import math -import unittest -import numpy as np - -from mindnlp.transformers import MptConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - AutoTokenizer, - MptForCausalLM, - MptForQuestionAnswering, - MptForSequenceClassification, - MptForTokenClassification, - MptModel, - ) - - -@require_mindspore -class MptModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=False, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=48, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_dropout_prob = attention_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = None - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return MptConfig.from_pretrained("mosaicml/mpt-7b") - - def prepare_config_and_inputs(self, gradient_checkpointing=False): - input_ids = 
ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config(gradient_checkpointing=gradient_checkpointing) - - return (config, input_ids, input_mask, sequence_labels) - - def get_config(self, gradient_checkpointing=False): - return MptConfig( - vocab_size=self.vocab_size, - seq_length=self.seq_length, - hidden_size=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - hidden_dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - num_labels=self.num_labels, - gradient_checkpointing=gradient_checkpointing, - dtype="float32", - ) - - def create_and_check_mpt_model(self, config, input_ids, input_mask, *args): - model = MptModel(config=config) - model.set_train(False) - - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.past_key_values), config.n_layers) - - def create_and_check_mpt_model_past(self, config, input_ids, input_mask, *args): - model = MptModel(config=config) - model.set_train(False) - - # first forward pass - outputs = model(input_ids, attention_mask=ops.ones_like(input_ids), use_cache=True) - outputs_use_cache_conf = model(input_ids, attention_mask=ops.ones_like(input_ids)) - outputs_no_past = model(input_ids, use_cache=False, attention_mask=ops.ones_like(input_ids)) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_mpt_model_attention_mask_past(self, config, input_ids, input_mask, *args): - model = MptModel(config=config) - model.set_train(False) - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - half_seq_length = self.seq_length // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past = model(input_ids, attention_mask=attn_mask).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = 
ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - axis=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_mpt_model_past_large_inputs(self, config, input_ids, input_mask, *args): - model = MptModel(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - output_hidden_states=True, - ) - hidden_states_from_no_past = output_from_no_past["hidden_states"][0] - - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - ) - hidden_states_from_past = output_from_past["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), hidden_states_from_past.shape[-1]).item() - output_from_no_past_slice = hidden_states_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = hidden_states_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args): - model = MptForCausalLM(config) - model.set_train(False) - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_sequence_classification_model(self, config, input_ids, input_mask, *args): - config.num_labels = self.num_labels - model = MptForSequenceClassification(config) - model.set_train(False) - - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_token_classification_model(self, config, input_ids, input_mask, *args): - model = MptForTokenClassification(config) - model.set_train(False) - - result = model(input_ids, 
attention_mask=input_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_question_answering_model(self, config, input_ids, input_mask, *args): - model = MptForQuestionAnswering(config) - model.set_train(False) - - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, *args, gradient_checkpointing=False - ): - model = MptForCausalLM(config) - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_mpt_weight_initialization(self, config, *args): - model = MptModel(config) - model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layers) - for key in model.parameters_dict().keys(): - if "c_proj" in key and "weight" in key: - self.parent.assertLessEqual(abs(ops.std(model.parameters_dict()[key]) - model_std), 0.001) - self.parent.assertLessEqual(abs(ops.mean(model.parameters_dict()[key]) - 0.0), 0.01) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - config, input_ids, input_mask, sequence_labels = config_and_inputs - - inputs_dict = {"input_ids": input_ids} - - return config, inputs_dict - - -class MptConfigTester(ConfigTester): - def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): - super().__init__(parent, config_class, has_text_modality, common_properties, **kwargs) - - def test_attn_config_as_dict(self): - config = self.config_class(**self.inputs_dict, attn_config={"attn_impl": "flash", "softmax_scale": None}) - self.parent.assertTrue(config.attn_config.attn_impl == "flash") - self.parent.assertTrue(config.attn_config.softmax_scale is None) - - def run_common_tests(self): - self.test_attn_config_as_dict() - return super().run_common_tests() - - -@require_mindspore -class MptModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - MptForCausalLM, - MptForSequenceClassification, - MptForTokenClassification, - MptForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - - all_generative_model_classes = (MptForCausalLM,) if is_mindspore_available() else () - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_torchscript = False - test_head_masking = False - pipeline_model_mapping = ( - { - "feature-extraction": MptModel, - "question-answering": MptForQuestionAnswering, - "text-classification": MptForSequenceClassification, - "text-generation": MptForCausalLM, - "token-classification": MptForTokenClassification, - "zero-shot": MptForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = MptModelTester(self) - self.config_tester = MptConfigTester(self, config_class=MptConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_mpt_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpt_model(*config_and_inputs) - - def test_mpt_model_alibi_tensor(self): - # test creation of alibi tensor when num heads is not a power of two - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
config_and_inputs[0].n_heads = 6 - self.model_tester.create_and_check_mpt_model(*config_and_inputs) - - def test_mpt_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpt_model_past(*config_and_inputs) - - def test_mpt_model_att_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpt_model_attention_mask_past(*config_and_inputs) - - def test_mpt_model_past_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpt_model_past_large_inputs(*config_and_inputs) - - def test_mpt_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_mpt_sequence_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_sequence_classification_model(*config_and_inputs) - - def test_mpt_token_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_token_classification_model(*config_and_inputs) - - def test_mpt_gradient_checkpointing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True) - - def test_mpt_weight_initialization(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_mpt_weight_initialization(*config_and_inputs) - - @unittest.skip("For backward compatibility the lm_head is not in the model's state dict on the Hub.") - def test_model_weights_reload_no_missing_tied_weights(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "mosaicml/mpt-7b" - model = MptModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@slow -class MptIntegrationTests(unittest.TestCase): - def test_generation_8k(self): - model_id = "mosaicml/mpt-7b-8k" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - # Load in 4bit to fit the daily CI runner GPU RAM - model = MptForCausalLM.from_pretrained( - model_id, torch_dtype=mindspore.float16 - ) - - input_text = "Hello" - expected_output = 'Hello, I\'m a new user of the forum. 
I have a question about the "Safety"' - - inputs = tokenizer(input_text, return_tensors="ms") - outputs = model.generate(**inputs, max_new_tokens=20) - - decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) - self.assertEqual(decoded_output, expected_output) - - def test_generation(self): - model_id = "mosaicml/mpt-7b" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - # Load in 4bit to fit the daily CI runner GPU RAM - model = MptForCausalLM.from_pretrained( - model_id, torch_dtype=mindspore.float16 - ) - - input_text = "Hello" - expected_output = ( - "Hello and welcome to the first day of the new release countdown for the month of May!\nToday" - ) - - inputs = tokenizer(input_text, return_tensors="ms") - outputs = model.generate(**inputs, max_new_tokens=20) - - decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) - self.assertEqual(decoded_output, expected_output) - - def test_generation_batched(self): - model_id = "mosaicml/mpt-7b" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - # Load in 4bit to fit the daily CI runner GPU RAM - model = MptForCausalLM.from_pretrained( - model_id, torch_dtype=mindspore.float16 - ) - - input_texts = ["Hello my name is", "Today I am going at the gym and"] - tokenizer.pad_token_id = tokenizer.eos_token_id - tokenizer.padding_side = "left" - - inputs = tokenizer(input_texts, return_tensors="ms", padding=True) - - expected_output = [ - "Hello my name is Tiffany and I am a mother of two beautiful children. I have been a nanny for over", - "Today I am going at the gym and then I am going to go to the grocery store and get some food. I am going to make", - ] - outputs = model.generate(**inputs, max_new_tokens=20) - - decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - for i, predicted_output in enumerate(decoded_outputs): - self.assertEqual(predicted_output, expected_output[i]) - - def test_model_logits(self): - model_id = "mosaicml/mpt-7b" - - # Load in 4bit to fit the daily CI runner GPU RAM - model = MptForCausalLM.from_pretrained( - model_id, torch_dtype=mindspore.float16 - ) - - dummy_input = mindspore.Tensor([[1, 2, 3, 4, 5]]) - - outputs = model(dummy_input, output_hidden_states=True) - - expected_slice = mindspore.Tensor([-0.2539, -0.2178, -0.1953]).to(mindspore.float16) - predicted_slice = outputs.hidden_states[-1][0, 0, :3] - - self.assertTrue(np.allclose(expected_slice.asnumpy(), predicted_slice.asnumpy(), atol=1e-3, rtol=1e-3)) \ No newline at end of file diff --git a/tests/transformers/models/mt5/__init__.py b/tests/transformers/models/mt5/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mt5/test_modeling_mt5.py b/tests/transformers/models/mt5/test_modeling_mt5.py deleted file mode 100644 index 5a53fe3d1..000000000 --- a/tests/transformers/models/mt5/test_modeling_mt5.py +++ /dev/null @@ -1,949 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import os -import pickle -import tempfile -import unittest - -from mindnlp.transformers import MT5Config -from mindnlp.transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - from mindnlp.engine import set_seed - - from mindnlp.transformers import ( - AutoModelForSeq2SeqLM, - AutoTokenizer, - MT5EncoderModel, - MT5ForConditionalGeneration, - MT5ForQuestionAnswering, - MT5ForSequenceClassification, - MT5ForTokenClassification, - MT5Model, - ) - - -# Copied from tests.models.t5.test_modeling_t5.T5ModelTester with T5->MT5 -class MT5ModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=7, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - decoder_start_token_id=0, - scope=None, - decoder_layers=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.scope = None - self.decoder_layers = decoder_layers - - def get_large_model_config(self): - return MT5Config.from_pretrained("google-t5/t5-base") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2) - input_ids[:, -1] = self.eos_token_id # Eos Token - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = self.get_config() - - return ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def get_pipeline_config(self): - return MT5Config( - vocab_size=166, # t5 forces 100 extra tokens - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - 
num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def get_config(self): - return MT5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def check_prepare_lm_labels_via_shift_left( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5Model(config=config) - - model.eval() - - # make sure that lm_labels are correctly padded from the right - lm_labels = lm_labels.masked_fill((lm_labels == self.decoder_start_token_id), self.eos_token_id) - - # add casaul pad token mask - triangular_mask = ops.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() - lm_labels = lm_labels.masked_fill(triangular_mask, self.pad_token_id) - decoder_input_ids = model._shift_right(lm_labels) - - for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): - # first item - self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) - if i < decoder_input_ids_slice.shape[-1]: - if i < decoder_input_ids.shape[-1] - 1: - # items before diagonal - self.parent.assertListEqual( - decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() - ) - # pad items after diagonal - if i < decoder_input_ids.shape[-1] - 2: - self.parent.assertListEqual( - decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() - ) - else: - # all items after square - self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) - - def create_and_check_model( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5Model(config=config) - - model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - decoder_output = result.last_hidden_state - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_with_lm_head( - 
self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5ForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_with_sequence_classification_head( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - labels = mindspore.tensor([1] * self.batch_size, dtype=mindspore.int64) - model = MT5ForSequenceClassification(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=input_ids, - labels=labels, - ) - # self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, config.num_labels)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5Model(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5Model(config=config).get_decoder() - - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = 
ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5Model(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_generate_with_past_key_values( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5ForConditionalGeneration(config=config).eval() - set_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False - ) - set_seed(0) - output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = MT5Model(config=config).half().eval() - output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_encoder_decoder_shared_weights( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - for model_class in [MT5Model, MT5ForConditionalGeneration]: - set_seed(0) - model = model_class(config=config).eval() - # load state dict copies weights but does not tie them - model.encoder.load_state_dict(model.decoder.state_dict(), 
strict=False) - - set_seed(0) - tied_config = copy.deepcopy(config) - tied_config.tie_encoder_decoder = True - tied_model = model_class(config=tied_config).eval() - - model_result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 - ) - ) - - # check that outputs after saving and loading are equal - with tempfile.TemporaryDirectory() as tmpdirname: - tied_model.save_pretrained(tmpdirname) - tied_model = model_class.from_pretrained(tmpdirname) - tied_model.eval() - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], - tied_model_result[0][0, :, random_slice_idx], - atol=1e-4, - ) - ) - - def check_resize_embeddings_t5_v1_1( - self, - config, - ): - prev_vocab_size = config.vocab_size - - config.tie_word_embeddings = False - model = MT5ForConditionalGeneration(config=config).eval() - model.resize_token_embeddings(prev_vocab_size - 10) - - self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "use_cache": False, - } - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.t5.test_modeling_t5.T5ModelTest with T5->MT5, google-t5/t5-small->google/mt5-small -class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (MT5Model, MT5ForConditionalGeneration, MT5ForSequenceClassification, MT5ForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (MT5ForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": MT5Model, - "question-answering": MT5ForQuestionAnswering, - "summarization": MT5ForConditionalGeneration, - "text-classification": MT5ForSequenceClassification, - "text2text-generation": MT5ForConditionalGeneration, - "translation": 
MT5ForConditionalGeneration, - "zero-shot": MT5ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - all_parallelizable_model_classes = (MT5Model, MT5ForConditionalGeneration) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - test_resize_embeddings = True - test_model_parallel = True - is_encoder_decoder = True - # The small MT5 model needs higher percentages for CPU/MP tests - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = MT5ModelTester(self) - self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) - - # `QAPipelineTests` is not working well with slow tokenizers (for some models) and we don't want to touch the file - # `src/transformers/data/processors/squad.py` (where this test fails for this model) - def is_pipeline_test_to_skip( - self, pipeline_test_case_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if tokenizer_name is None: - return True - if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - def test_config(self): - self.config_tester.run_common_tests() - - def test_shift_right(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_v1_1(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - # check that gated gelu feed forward and different word embeddings work - config = config_and_inputs[0] - config.tie_word_embeddings = False - config.feed_forward_proj = "gated-gelu" - self.model_tester.create_and_check_model(config, *config_and_inputs[1:]) - - # MT5ForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (MT5Model, MT5ForConditionalGeneration, MT5ForQuestionAnswering): - model = model_class(config) - - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - def test_config_and_model_silu_gated(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - config.feed_forward_proj = "gated-silu" - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_with_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_lm_head(*config_and_inputs) - - def test_with_sequence_classification_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs) - - def test_decoder_model_past(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_past_with_attn_mask(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_decoder_model_past_with_3d_attn_mask(self): - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = self.model_tester.prepare_config_and_inputs() - - attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length], - vocab_size=2, - ) - decoder_attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length], - vocab_size=2, - ) - - self.model_tester.create_and_check_decoder_model_attention_mask_past( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_generate_with_past_key_values(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs) - - def test_encoder_decoder_shared_weights(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_v1_1_resize_embeddings(self): - config = self.model_tester.prepare_config_and_inputs()[0] - self.model_tester.check_resize_embeddings_t5_v1_1(config) - - @slow - def test_model_from_pretrained(self): - model_name = "google/mt5-small" - model = MT5Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_generate_with_head_masking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - max_length = config_and_inputs[1].shape[-1] + 3 - model = MT5ForConditionalGeneration(config).eval() - - - head_masking = { - "head_mask": ops.zeros(config.num_layers, config.num_heads), - "decoder_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - "cross_attn_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - } - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - head_masks = {name: mask} - # Explicitly pass decoder_head_mask as it is required from MT5 model when head_mask specified - if name == "head_mask": - head_masks["decoder_head_mask"] = ops.ones( - config.num_decoder_layers, config.num_heads - ) - - out = model.generate( - config_and_inputs[1], - num_beams=1, - max_length=max_length, - output_attentions=True, - return_dict_in_generate=True, - **head_masks, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) - - -# Copied from 
tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->MT5 -class MT5EncoderOnlyModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - # For common tests - use_attention_mask=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - is_training=False, - dropout_rate=0.1, - initializer_factor=0.002, - is_encoder_decoder=False, - eos_token_id=1, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - # For common tests - self.seq_length = self.encoder_seq_length - self.use_attention_mask = use_attention_mask - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.is_encoder_decoder = is_encoder_decoder - self.scope = None - self.is_training = is_training - - def get_large_model_config(self): - return MT5Config.from_pretrained("google-t5/t5-base") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - config = MT5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - attention_mask, - ): - model = MT5EncoderModel(config=config) - - model.eval() - result = model( - input_ids=input_ids, - attention_mask=attention_mask, - ) - result = model(input_ids=input_ids) - encoder_output = result.last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - attention_mask, - ): - model = MT5EncoderModel(config=config).half().eval() - output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_with_token_classification_head( - self, - config, - input_ids, - attention_mask, - ): - labels = mindspore.tensor([1] * self.seq_length * self.batch_size, dtype=mindspore.int64) - model = MT5ForTokenClassification(config=config).eval() - outputs = model( - input_ids=input_ids, - labels=labels, - attention_mask=attention_mask, - ) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.seq_length, config.num_labels)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - 
) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -# Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTest with T5->MT5 -class MT5EncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (MT5EncoderModel, MT5ForTokenClassification) if is_mindspore_available() else () - test_pruning = False - test_resize_embeddings = False - test_model_parallel = True - pipeline_model_mapping = ( - { - "token-classification": MT5ForTokenClassification, - } - if is_mindspore_available() - else {} - ) - all_parallelizable_model_classes = (MT5EncoderModel,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = MT5EncoderOnlyModelTester(self) - self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_with_token_classification_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class MT5IntegrationTest(unittest.TestCase): - @slow - def test_small_integration_test(self): - """ - For comparision run: - >>> import t5 # pip install t5==0.7.1 - >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary - - >>> path_to_mtf_small_mt5_checkpoint = '' - >>> path_to_mtf_small_mt5_spm_model_path = '' - >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None) - >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path) - >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab) - """ - - model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", return_dict=True) - tokenizer = AutoTokenizer.from_pretrained("google/mt5-small") - - input_ids = tokenizer("Hello there", return_tensors="ms").input_ids - labels = tokenizer("Hi I am", return_tensors="ms").input_ids - - loss = model(input_ids, labels=labels).loss - mtf_score = -(labels.shape[-1] * loss.item()) - - EXPECTED_SCORE = -84.9127 - print(mtf_score) - self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) \ No newline at end of file diff --git a/tests/transformers/models/musicgen/__init__.py b/tests/transformers/models/musicgen/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/musicgen/test_modeling_musicgen.py b/tests/transformers/models/musicgen/test_modeling_musicgen.py deleted file mode 100644 index 2f4beacfe..000000000 --- a/tests/transformers/models/musicgen/test_modeling_musicgen.py +++ /dev/null @@ -1,1297 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Musicgen model.""" - -import copy -import inspect -import math -import tempfile -import unittest - -import numpy as np -from parameterized import parameterized -from pytest import mark - -from mindnlp.transformers import ( - EncodecConfig, - MusicgenConfig, - MusicgenDecoderConfig, - MusicgenProcessor, - PretrainedConfig, - T5Config, -) -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad, optim - from mindnlp.engine import set_seed - - from mindnlp.transformers import ( - MusicgenForCausalLM, - MusicgenForConditionalGeneration, - MusicgenModel, - ) - from mindnlp.transformers.generation import ( - GenerateDecoderOnlyOutput, - GenerateEncoderDecoderOutput, - ) - - -def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): - no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) - setattr(configs_no_init, key, no_init_subconfig) - return configs_no_init - - -def prepare_musicgen_decoder_inputs_dict( - config, - input_ids, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.reshape(-1, config.num_codebooks, input_ids.shape[-1])[:, 0, :] - attention_mask = attention_mask.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.num_hidden_layers, config.num_attention_heads) - if encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_attention_mask = ops.ones(encoder_hidden_states.shape[:2]) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.num_hidden_layers, config.num_attention_heads) - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "encoder_hidden_states": encoder_hidden_states, - "encoder_attention_mask": encoder_attention_mask, - "head_mask": head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class MusicgenDecoderTester: - def __init__( - self, - parent, - batch_size=4, # need batch_size != num_hidden_layers - seq_length=7, - is_training=True, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=100, - pad_token_id=99, - bos_token_id=99, - num_codebooks=4, - ): - self.parent = parent - self.batch_size = batch_size - 
self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.num_codebooks = num_codebooks - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size * self.num_codebooks, self.seq_length], self.vocab_size) - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - - config = self.get_config() - inputs_dict = prepare_musicgen_decoder_inputs_dict( - config, - input_ids, - encoder_hidden_states=encoder_hidden_states, - ) - return config, inputs_dict - - def get_config(self): - config = MusicgenDecoderConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - d_ff=self.intermediate_size, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.bos_token_id, - bos_token_id=self.bos_token_id, - num_codebooks=self.num_codebooks, - tie_word_embeddings=False, - ) - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MusicgenModel, MusicgenForCausalLM) if is_mindspore_available() else () - greedy_sample_model_classes = ( - (MusicgenForCausalLM,) if is_mindspore_available() else () - ) # we don't want to run all the generation tests, only a specific subset - pipeline_model_mapping = {} - test_pruning = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = MusicgenDecoderTester(self) - self.config_tester = ConfigTester(self, config_class=MusicgenDecoderConfig, hidden_size=16) - - def test_config(self): - self.config_tester.run_common_tests() - - # special case for labels - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_codebooks), - dtype=mindspore.int64, - ) - return inputs_dict - - def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - model = MusicgenForCausalLM(config) - - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - model.train() - - # Contrarily to the initial method, we don't unfreeze freezed parameters. - # Indeed, sinusoidal position embeddings have frozen weights that should stay frozen. 
- - optimizer = optim.SGD(model.parameters(), lr=0.01) - - inputs = self._prepare_for_class(inputs_dict, MusicgenForCausalLM, return_labels=True) - loss = model(**inputs).loss - loss.backward() - optimizer.step() - - for k, v in model.named_parameters(): - if v.requires_grad: - self.assertTrue(v.grad is not None, f"{k} in {MusicgenForCausalLM.__name__} has no gradient!") - - # override since we have to compute the input embeddings over codebooks - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - embed_tokens = model.get_input_embeddings() - - input_ids = input_ids.reshape(-1, config.num_codebooks, input_ids.shape[-1]) - - inputs["inputs_embeds"] = sum( - [embed_tokens[codebook](input_ids[:, codebook]) for codebook in range(config.num_codebooks)] - ) - - with no_grad(): - model(**inputs)[0] - - # override since we have embeddings / LM heads over multiple codebooks - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - first_embed = model.get_input_embeddings()[0] - self.assertIsInstance(first_embed, nn.Embedding) - lm_heads = model.get_output_embeddings() - self.assertTrue(lm_heads is None or isinstance(lm_heads[0], nn.Linear)) - - @unittest.skip(reason="MusicGen does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="MusicGen does not support all arguments tested") - def test_model_outputs_equivalence(self): - pass - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied") - def test_tie_model_weights(self): - pass - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied") - def test_tied_weights_keys(self): - pass - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - _ = inputs_dict.pop("attention_mask", None) - inputs_dict = { - k: v[:batch_size, ...] 
- for k, v in inputs_dict.items() - if "head_mask" not in k and isinstance(v, mindspore.Tensor) - } - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[: batch_size * config.num_codebooks, :] - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - return config, input_ids, attention_mask, inputs_dict - - @staticmethod - def _get_logits_processor_and_warper_kwargs( - input_length, - forced_bos_token_id=None, - forced_eos_token_id=None, - ): - process_kwargs = {} - warper_kwargs = {} - return process_kwargs, warper_kwargs - - def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config() - config.audio_channels = 2 - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - inputs_dict={}, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - -def prepare_musicgen_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - labels=None, -): - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.reshape( - -1, config.decoder.num_codebooks, decoder_input_ids.shape[-1] - )[:, 0, :] - decoder_attention_mask = decoder_attention_mask.ne(config.decoder.pad_token_id) - if head_mask is None: - head_mask = ops.ones( - config.text_encoder.num_hidden_layers, config.text_encoder.num_attention_heads - ) - if decoder_head_mask is None: - decoder_head_mask = ops.ones( - config.decoder.num_hidden_layers, config.decoder.num_attention_heads - ) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones( - config.decoder.num_hidden_layers, config.decoder.num_attention_heads - ) - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "labels": labels, - } - - -class MusicgenTester: - def __init__( - self, - parent, - batch_size=4, # need batch_size != num_hidden_layers - seq_length=7, - is_training=True, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=100, - pad_token_id=99, - bos_token_id=99, - num_codebooks=4, - num_filters=4, - codebook_size=128, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.num_codebooks = num_codebooks - self.num_filters = 
num_filters - self.codebook_size = codebook_size - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - decoder_input_ids = ids_tensor([self.batch_size * self.num_codebooks, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_musicgen_inputs_dict(config, input_ids, decoder_input_ids=decoder_input_ids) - return config, inputs_dict - - def get_config(self): - text_encoder_config = T5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.intermediate_size, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - ) - audio_encoder_config = EncodecConfig( - hidden_size=self.vocab_size, - compress=1, - num_filters=self.num_filters, - codebook_size=self.codebook_size, - codebook_dim=self.vocab_size, - ) - decoder_config = MusicgenDecoderConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.bos_token_id, - bos_token_id=self.bos_token_id, - num_codebooks=self.num_codebooks, - tie_word_embeddings=False, - ) - config = MusicgenConfig.from_sub_models_config(text_encoder_config, audio_encoder_config, decoder_config) - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MusicgenForConditionalGeneration,) if is_mindspore_available() else () - greedy_sample_model_classes = (MusicgenForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = {"text-to-audio": MusicgenForConditionalGeneration} if is_mindspore_available() else {} - test_pruning = False # training is not supported yet for MusicGen - test_headmasking = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = MusicgenTester(self) - - # special case for labels - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_codebooks), - dtype=mindspore.int64, - ) - return inputs_dict - - def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - model = model_class(config) - - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - model.train() - - # The audio encoder weights are not used during the forward pass (only during the generate pass) - # So we need to freeze it to be able to train. 
- model.freeze_audio_encoder() - - optimizer = optim.SGD(model.parameters(), lr=0.01) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() - optimizer.step() - - for k, v in model.named_parameters(): - if v.requires_grad: - self.assertTrue(v.grad is not None, f"{k} in {model_class.__name__} has no gradient!") - - def _check_output_with_attentions(self, outputs, config, input_ids, decoder_input_ids): - text_encoder_config = config.text_encoder - decoder_config = config.decoder - - encoder_attentions = outputs["encoder_attentions"] - self.assertEqual(len(encoder_attentions), text_encoder_config.num_hidden_layers) - - self.assertEqual( - encoder_attentions[0].shape[-3:], - (text_encoder_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]), - ) - - decoder_attentions = outputs["decoder_attentions"] - num_decoder_layers = decoder_config.num_hidden_layers - self.assertEqual(len(decoder_attentions), num_decoder_layers) - - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), - ) - - cross_attentions = outputs["cross_attentions"] - self.assertEqual(len(cross_attentions), num_decoder_layers) - - cross_attention_input_seq_len = decoder_input_ids.shape[-1] - self.assertEqual( - cross_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, cross_attention_input_seq_len, input_ids.shape[-1]), - ) - - def check_musicgen_model_output_attentions( - self, - model_class, - config, - input_ids, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - **kwargs, - ): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - **kwargs, - ) - self._check_output_with_attentions(outputs, config, input_ids, decoder_input_ids) - - def check_musicgen_model_output_attentions_from_config( - self, - model_class, - config, - input_ids, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - **kwargs, - ): - # Similar to `check_musicgen_model_output_attentions`, but with `output_attentions` triggered from the - # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded - # from the inner models' configurations. 
- config.output_attentions = True # model config -> won't work - - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - **kwargs, - ) - self.assertTrue( - all(key not in outputs for key in ["encoder_attentions", "decoder_attentions", "cross_attentions"]) - ) - config.text_encoder.output_attentions = True # inner model config -> will work - config.audio_encoder.output_attentions = True - config.decoder.output_attentions = True - - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - **kwargs, - ) - self._check_output_with_attentions(outputs, config, input_ids, decoder_input_ids) - - # override since changing `output_attentions` from the top-level model config won't work - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - self.check_musicgen_model_output_attentions(model_class, config, **inputs_dict) - self.check_musicgen_model_output_attentions_from_config(model_class, config, **inputs_dict) - - # override since we have a specific forward signature for musicgen - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_ids", - "attention_mask", - "input_values", - "padding_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - # override since changing `gradient_checkpointing` from the top-level model config won't work - def test_gradient_checkpointing_backward_compatibility(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - - config.text_encoder.gradient_checkpointing = True - config.audio_encoder.gradient_checkpointing = True - config.decoder.gradient_checkpointing = True - model = model_class(config) - self.assertTrue(model.is_gradient_checkpointing) - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") - def test_tie_model_weights(self): - pass - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") - def test_tied_model_weights_key_ignore(self): - pass - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") - def test_tied_weights_keys(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - 
@unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - # override since changing `output_hidden_states` from the top-level model config won't work - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states - - expected_num_layers = self.model_tester.num_hidden_layers + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.seq_length - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - hidden_states = outputs.decoder_hidden_states - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.text_encoder.output_hidden_states = True - config.audio_encoder.output_hidden_states = True - config.decoder.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # override since the conv layers and lstm's in encodec are exceptions - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv"] - ignore_init = ["lstm"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif not any(x in name for x in ignore_init): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # override since we have embeddings / LM heads over multiple codebooks - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), nn.Embedding) - lm_heads = model.get_output_embeddings() - self.assertTrue(lm_heads is None or isinstance(lm_heads[0], nn.Linear)) - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[:batch_size, :] - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - - return config, input_ids, attention_mask - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen (input / outputs are - # different modalities -> different 
shapes) - def _greedy_generate( - self, - model, - input_ids, - attention_mask, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen (input / outputs are - # different modalities -> different shapes) - def _sample_generate( - self, - model, - input_ids, - attention_mask, - num_return_sequences, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - set_seed(0) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=True, - num_beams=1, - max_new_tokens=self.max_new_tokens, - num_return_sequences=num_return_sequences, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - - @staticmethod - def _get_logits_processor_and_warper_kwargs( - input_length, - forced_bos_token_id=None, - forced_eos_token_id=None, - ): - process_kwargs = {} - warper_kwargs = {} - return process_kwargs, warper_kwargs - - def test_greedy_generate_dict_outputs(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - def test_greedy_generate_dict_outputs_use_cache(self): - for model_class in self.greedy_sample_model_classes: - # enable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - - config.use_cache = True - config.is_decoder = True - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - def test_sample_generate(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - model = model_class(config).eval() - - # check `generate()` and `sample()` are equal - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - num_return_sequences=1, - ) - self.assertIsInstance(output_generate, mindspore.Tensor) - - def test_sample_generate_dict_output(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - 
config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).eval() - - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - num_return_sequences=3, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - def test_generate_without_input_ids(self): - config, _, _ = self._get_input_ids_and_config() - - # if no bos token id => cannot generate from None - if config.bos_token_id is None: - self.skipTest(reason="bos_token_id is None") - - for model_class in self.greedy_sample_model_classes: - model = model_class(config) - model.eval() - - output_ids_generate = model.generate( - do_sample=False, max_new_tokens=self.max_new_tokens, remove_invalid_values=True - ) - self.assertIsNotNone(output_ids_generate) - - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - - for model_class in self.greedy_sample_model_classes: - model = model_class(config).eval() - model.half() - # greedy - model.generate(input_dict["input_ids"], attention_mask=input_dict["attention_mask"], max_new_tokens=10) - # sampling - model.generate( - input_dict["input_ids"], attention_mask=input_dict["attention_mask"], do_sample=True, max_new_tokens=10 - ) - - def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.audio_channels = 2 - - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - @unittest.skip( - reason="MusicgenModel is actually not the base of MusicgenForCausalLM as the latter is a composit model" - ) - def test_save_load_fast_init_from_base(self): - pass - - def test_requires_grad_with_frozen_encoders(self): - config = self.model_tester.get_config() - for model_class in self.all_model_classes: - model = model_class(config) - model.freeze_audio_encoder() - - audio_encoder_grads = [param.requires_grad for param in model.audio_encoder.parameters()] - text_encoder_grads = [param.requires_grad for param in model.text_encoder.parameters()] - - self.assertFalse(all(audio_encoder_grads)) - self.assertTrue(all(text_encoder_grads)) - - model = model_class(config) - model.freeze_text_encoder() - - audio_encoder_grads = [param.requires_grad for param in model.audio_encoder.parameters()] - text_encoder_grads = [param.requires_grad for param in model.text_encoder.parameters()] - - self.assertTrue(all(audio_encoder_grads)) - self.assertFalse(all(text_encoder_grads)) - - -def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000): - """Produces a series of 'bip bip' sounds at a given frequency.""" - timesteps = np.arange(int(duration * sample_rate)) / sample_rate - wav = np.cos(2 * math.pi * 440 * timesteps) - time_period = (timesteps % (2 * bip_duration)) / (2 * bip_duration) - envelope = time_period >= 0.5 - return wav * envelope - - -@require_mindspore -class MusicgenIntegrationTests(unittest.TestCase): - @cached_property - def model(self): - return 
MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small") - - @cached_property - def processor(self): - return MusicgenProcessor.from_pretrained("facebook/musicgen-small") - - @slow - def test_logits_text_prompt(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - # prepare the decoder inputs - pad_token_id = model.generation_config.pad_token_id - decoder_input_ids = ( - ops.ones((input_ids.shape[0] * model.decoder.num_codebooks, 1), dtype=mindspore.int64) - * pad_token_id - ) - - with no_grad(): - logits = model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - ).logits - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - -0.9708, -3.0149, -4.6415, -1.4754, -0.2786, -2.3523, -2.6049, -6.7467, - -1.0206, -3.2984, -3.3968, -1.5108, -1.5786, -3.1493, -1.1503, -0.0545, - ] - ) - # fmt: on - - self.assertTrue(logits.shape == (*decoder_input_ids.shape, model.decoder.config.vocab_size)) - print(logits[0, 0, :16]) - self.assertTrue(ops.allclose(logits[0, 0, :16], EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_logits_text_audio_prompt(self): - model = self.model - processor = self.processor - - audio = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)] - text = ["80s music", "Club techno"] - - inputs = processor(audio=audio, text=text, padding=True, return_tensors="ms") - - # prepare the text encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - # prepare the audio encoder inputs - input_values = inputs.input_values - padding_mask = inputs.padding_mask - - with no_grad(): - logits = model( - input_ids, - attention_mask=attention_mask, - input_values=input_values, - padding_mask=padding_mask, - ).logits - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - 0.1841, -2.9324, -0.7898, 0.1857, 0.4971, -2.8685, -1.6525, -1.6541, - 2.7757, -2.5942, -3.0959, -1.0120, -1.0147, -0.4605, -0.8885, 0.6820, - ] - ) - # fmt: on - - self.assertTrue(logits.shape == (8, 50, 2048)) - self.assertTrue(ops.allclose(logits[0, -1, :16], EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_generate_unconditional_greedy(self): - model = self.model - - # only generate 1 sample with greedy - since it's deterministic all elements of the batch will be the same - unconditional_inputs = model.get_unconditional_inputs(num_samples=1) - - output_values = model.generate(**unconditional_inputs, do_sample=False, max_new_tokens=5) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - 0.0056, 0.0064, 0.0063, 0.0054, 0.0042, 0.0033, 0.0024, 0.0015, - 0.0015, 0.0010, 0.0004, -0.0012, -0.0036, -0.0055, -0.0067, -0.0071, - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (1, 1, 3200)) - self.assertTrue(ops.allclose(output_values[0, 0, :16], EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_unconditional_sampling(self): - model = self.model - - # for stochastic sampling we can generate multiple outputs - unconditional_inputs = model.get_unconditional_inputs(num_samples=2) - - set_seed(0) - output_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=10) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -0.0099, -0.0140, 0.0079, 0.0080, -0.0046, 0.0065, -0.0068, -0.0185, - 0.0105, 0.0059, 0.0329, 0.0249, -0.0204, -0.0341, -0.0465, 0.0053, - ] - ) - # fmt: on - - 
self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16], EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_prompt_greedy(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - output_values = model.generate( - input_ids, attention_mask=attention_mask, do_sample=False, guidance_scale=None, max_new_tokens=10 - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -1.1998e-04, -2.2302e-04, 4.6296e-04, 1.0524e-03, 2.4827e-04, - -4.0288e-05, -1.2468e-04, 4.9846e-05, 7.1485e-04, 4.4197e-04, - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :10], EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_prompt_greedy_with_classifier_free_guidance(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - output_values = model.generate( - input_ids, attention_mask=attention_mask, do_sample=False, guidance_scale=3, max_new_tokens=10 - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - 0.0283, 0.0246, 0.0650, 0.0640, 0.0599, 0.0711, 0.0420, 0.0112, - 0.0511, 0.0746, 0.1363, 0.1213, 0.0185, -0.0578, -0.0908, 0.0443, - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16], EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_prompt_sampling(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - set_seed(0) - output_values = model.generate( - input_ids, attention_mask=attention_mask, do_sample=True, guidance_scale=None, max_new_tokens=10 - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -0.0111, -0.0154, 0.0047, 0.0058, -0.0068, 0.0012, -0.0109, -0.0229, - 0.0010, -0.0038, 0.0167, 0.0042, -0.0421, -0.0610, -0.0764, -0.0326, - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16], EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_audio_prompt(self): - model = self.model - processor = self.processor - - audio = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)] - text = ["80s music", "Club techno"] - - inputs = processor(audio=audio, text=text, padding=True, return_tensors="ms") - - output_values = model.generate(**inputs, do_sample=False, guidance_scale=None, max_new_tokens=10) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -0.0036, -0.0130, -0.0261, -0.0384, -0.0557, -0.0718, -0.0680, -0.0632, - -0.0529, -0.0403, -0.0289, -0.0198, -0.0136, -0.0101, -0.0095, -0.0040, - ] - ) - # fmt: on - - self.assertTrue( - output_values.shape == (2, 1, 36480) - ) # input values take shape 32000 and we generate from there - self.assertTrue(ops.allclose(output_values[0, 0, -16:], EXPECTED_VALUES, atol=1e-4)) - - -@require_mindspore -class MusicgenStereoIntegrationTests(unittest.TestCase): - @cached_property - def model(self): - return 
MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-stereo-small") - - @cached_property - def processor(self): - return MusicgenProcessor.from_pretrained("facebook/musicgen-stereo-small") - - @slow - def test_generate_unconditional_greedy(self): - model = self.model - - # only generate 1 sample with greedy - since it's deterministic all elements of the batch will be the same - unconditional_inputs = model.get_unconditional_inputs(num_samples=1) - - output_values = model.generate(**unconditional_inputs, do_sample=False, max_new_tokens=12) - - # fmt: off - EXPECTED_VALUES_LEFT = mindspore.tensor( - [ - 0.0017, 0.0004, 0.0004, 0.0005, 0.0002, 0.0002, -0.0002, -0.0013, - -0.0010, -0.0015, -0.0018, -0.0032, -0.0060, -0.0082, -0.0096, -0.0099, - ] - ) - EXPECTED_VALUES_RIGHT = mindspore.tensor( - [ - 0.0038, 0.0028, 0.0031, 0.0032, 0.0031, 0.0032, 0.0030, 0.0019, - 0.0021, 0.0015, 0.0009, -0.0008, -0.0040, -0.0067, -0.0087, -0.0096, - ] - ) - # fmt: on - - # (bsz, channels, seq_len) - self.assertTrue(output_values.shape == (1, 2, 5760)) - self.assertTrue(ops.allclose(output_values[0, 0, :16], EXPECTED_VALUES_LEFT, atol=1e-4)) - self.assertTrue(ops.allclose(output_values[0, 1, :16], EXPECTED_VALUES_RIGHT, atol=1e-4)) - - @slow - def test_generate_text_audio_prompt(self): - model = self.model - processor = self.processor - - # create stereo inputs - audio = [get_bip_bip(duration=0.5)[None, :].repeat(2, 0), get_bip_bip(duration=1.0)[None, :].repeat(2, 0)] - text = ["80s music", "Club techno"] - - inputs = processor(audio=audio, text=text, padding=True, return_tensors="ms") - - output_values = model.generate(**inputs, do_sample=False, guidance_scale=3.0, max_new_tokens=12) - - # fmt: off - EXPECTED_VALUES_LEFT = mindspore.tensor( - [ - 0.2535, 0.2008, 0.1471, 0.0896, 0.0306, -0.0200, -0.0501, -0.0728, - -0.0832, -0.0856, -0.0867, -0.0884, -0.0864, -0.0866, -0.0744, -0.0430, - ] - ) - EXPECTED_VALUES_RIGHT = mindspore.tensor( - [ - 0.1695, 0.1213, 0.0732, 0.0239, -0.0264, -0.0705, -0.0935, -0.1103, - -0.1163, -0.1139, -0.1104, -0.1082, -0.1027, -0.1004, -0.0900, -0.0614, - ] - ) - # fmt: on - - # (bsz, channels, seq_len) - self.assertTrue(output_values.shape == (2, 2, 37760)) - # input values take shape 32000 and we generate from there - we check the last (generated) values - self.assertTrue(ops.allclose(output_values[0, 0, -16:], EXPECTED_VALUES_LEFT, atol=1e-4)) - self.assertTrue(ops.allclose(output_values[0, 1, -16:], EXPECTED_VALUES_RIGHT, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/musicgen_melody/__init__.py b/tests/transformers/models/musicgen_melody/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/musicgen_melody/test_feature_extraction_musicgen_melody.py b/tests/transformers/models/musicgen_melody/test_feature_extraction_musicgen_melody.py deleted file mode 100644 index 51f3c23fb..000000000 --- a/tests/transformers/models/musicgen_melody/test_feature_extraction_musicgen_melody.py +++ /dev/null @@ -1,224 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import itertools -import math -import os -import random -import tempfile -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import ( - check_json_file_has_correct_format, - require_mindspore, -) - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -import mindspore -from mindspore import ops - -from mindnlp.transformers import MusicgenMelodyFeatureExtractor - - -global_rng = random.Random() - - -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -# Copied from tests.models.musicgen.test_modeling_musicgen.get_bip_bip -def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000): - """Produces a series of 'bip bip' sounds at a given frequency.""" - timesteps = np.arange(int(duration * sample_rate)) / sample_rate - wav = np.cos(2 * math.pi * 440 * timesteps) - time_period = (timesteps % (2 * bip_duration)) / (2 * bip_duration) - envelope = time_period >= 0.5 - return wav * envelope - - -@require_mindspore -class MusicgenMelodyFeatureExtractionTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - min_seq_length=400, - max_seq_length=2000, - feature_size=12, - padding_value=0.0, - sampling_rate=4_000, - return_attention_mask=True, - ): - self.parent = parent - self.batch_size = batch_size - self.min_seq_length = min_seq_length - self.max_seq_length = max_seq_length - self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) - self.padding_value = padding_value - self.sampling_rate = sampling_rate - self.return_attention_mask = return_attention_mask - self.feature_size = feature_size - self.num_chroma = feature_size - - def prepare_feat_extract_dict(self): - return { - "feature_size": self.feature_size, - "padding_value": self.padding_value, - "sampling_rate": self.sampling_rate, - "return_attention_mask": self.return_attention_mask, - } - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester.prepare_inputs_for_common - def prepare_inputs_for_common(self, equal_length=False, numpify=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_length: - speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)] - else: - # make sure that inputs increase in size - speech_inputs = [ - floats_list((x, self.feature_size)) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) - ] - if numpify: - speech_inputs = [np.asarray(x) for x in speech_inputs] - return speech_inputs - - -@require_mindspore -class MusicgenMelodyFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = MusicgenMelodyFeatureExtractor - - def setUp(self): - 
self.feat_extract_tester = MusicgenMelodyFeatureExtractionTester(self) - - # Copied from tests.models.seamless_m4t.test_feature_extraction_seamless_m4t.SeamlessM4TFeatureExtractionTest.test_feat_extract_from_and_save_pretrained - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - self.assertDictEqual(dict_first, dict_second) - - # Copied from tests.models.seamless_m4t.test_feature_extraction_seamless_m4t.SeamlessM4TFeatureExtractionTest.test_feat_extract_to_json_file - def test_feat_extract_to_json_file(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - self.assertEqual(dict_first, dict_second) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test feature size - input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features - self.assertTrue(input_features.ndim == 3) - self.assertTrue(input_features.shape[0] == 3) - # Ignore copy - self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size) - - # Test not batched input - encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. 
- speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_call_from_demucs(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - - # (batch_size, num_stems, channel_size, audio_length) - inputs = ops.rand([4, 5, 2, 44000]) - - # Test feature size - input_features = feature_extractor(inputs, padding=True, return_tensors="np").input_features - self.assertTrue(input_features.ndim == 3) - self.assertTrue(input_features.shape[0] == 4) - self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size) - - # Test single input - encoded_sequences_1 = feature_extractor(inputs[[0]], return_tensors="np").input_features - self.assertTrue(np.allclose(encoded_sequences_1[0], input_features[0], atol=1e-3)) - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad with input_features->input_features - def test_double_precision_pad(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="ms") - self.assertTrue(pt_processed.input_features.dtype == mindspore.float32) - - def test_integration(self): - EXPECTED_INPUT_FEATURES = ops.zeros((2, 8, 12)) - EXPECTED_INPUT_FEATURES[0, :6, 9] = 1 - EXPECTED_INPUT_FEATURES[0, 6:, 0] = 1 - EXPECTED_INPUT_FEATURES[1, :, 9] = 1 - - input_speech = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)] - feature_extractor = MusicgenMelodyFeatureExtractor() - input_features = feature_extractor(input_speech, return_tensors="ms").input_features - - self.assertEqual(input_features.shape, (2, 8, 12)) - self.assertTrue((input_features == EXPECTED_INPUT_FEATURES).all()) diff --git a/tests/transformers/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/transformers/models/musicgen_melody/test_modeling_musicgen_melody.py deleted file mode 100644 index c4892e620..000000000 --- a/tests/transformers/models/musicgen_melody/test_modeling_musicgen_melody.py +++ /dev/null @@ -1,1289 +0,0 @@ -# coding=utf-8 -# Copyright 2024, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Musicgen Melody model.""" - -import copy -import inspect -import math -import tempfile -import unittest - -import numpy as np -from parameterized import parameterized -from pytest import mark - -from mindnlp.transformers import ( - EncodecConfig, - MusicgenMelodyConfig, - MusicgenMelodyDecoderConfig, - PretrainedConfig, - T5Config, -) -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - require_mindspore_gpu, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, optim, no_grad - - from mindnlp.transformers import ( - MusicgenMelodyForCausalLM, - MusicgenMelodyForConditionalGeneration, - MusicgenMelodyModel, - ) - from mindnlp.engine import set_seed - from mindnlp.transformers.generation import ( - GenerateDecoderOnlyOutput, - ) - from mindnlp.transformers import MusicgenMelodyProcessor - - -def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): - no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) - setattr(configs_no_init, key, no_init_subconfig) - return configs_no_init - - -def prepare_musicgen_melody_decoder_inputs_dict( - config, - input_ids, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.reshape(-1, config.num_codebooks, input_ids.shape[-1])[:, 0, :] - attention_mask = attention_mask.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.num_hidden_layers, config.num_attention_heads) - if encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_attention_mask = ops.ones(encoder_hidden_states.shape[:2]) - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "encoder_hidden_states": encoder_hidden_states, - "encoder_attention_mask": encoder_attention_mask, - "head_mask": head_mask, - } - - -class MusicgenMelodyDecoderTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden_layers because of #29297 - seq_length=7, - is_training=True, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=100, - pad_token_id=99, - bos_token_id=99, - num_codebooks=4, - conditional_seq_length=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.pad_token_id = 
pad_token_id - self.bos_token_id = bos_token_id - self.num_codebooks = num_codebooks - self.conditional_seq_length = conditional_seq_length - self.encoder_seq_length = conditional_seq_length + seq_length - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size * self.num_codebooks, self.seq_length], self.vocab_size) - encoder_hidden_states = floats_tensor([self.batch_size, self.conditional_seq_length, self.hidden_size]) - - config = self.get_config() - inputs_dict = prepare_musicgen_melody_decoder_inputs_dict( - config, - input_ids, - encoder_hidden_states=encoder_hidden_states, - ) - return config, inputs_dict - - def get_config(self): - config = MusicgenMelodyDecoderConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - d_ff=self.intermediate_size, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.bos_token_id, - bos_token_id=self.bos_token_id, - num_codebooks=self.num_codebooks, - tie_word_embeddings=False, - ) - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MusicgenMelodyModel, MusicgenMelodyForCausalLM) if is_mindspore_available() else () - greedy_sample_model_classes = ( - (MusicgenMelodyForCausalLM,) if is_mindspore_available() else () - ) # the model uses a custom generation method so we only run a specific subset of the generation tests - test_pruning = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = MusicgenMelodyDecoderTester(self) - self.config_tester = ConfigTester(self, config_class=MusicgenMelodyDecoderConfig, hidden_size=16) - - def test_config(self): - self.config_tester.run_common_tests() - - # special case for labels - # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest._prepare_for_class - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_codebooks), - dtype=mindspore.int64, - ) - return inputs_dict - - # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.check_training_gradient_checkpointing with Musicgen->MusicgenMelody - def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - model = MusicgenMelodyForCausalLM(config) - - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - model.train() - - # Contrarily to the initial method, we don't unfreeze freezed parameters. - # Indeed, sinusoidal position embeddings have frozen weights that should stay frozen. 
- - optimizer = optim.SGD(model.parameters(), lr=0.01) - - inputs = self._prepare_for_class(inputs_dict, MusicgenMelodyForCausalLM, return_labels=True) - loss = model(**inputs).loss - loss.backward() - optimizer.step() - - for k, v in model.named_parameters(): - if v.requires_grad: - self.assertTrue(v.grad is not None, f"{k} in {MusicgenMelodyForCausalLM.__name__} has no gradient!") - - # override since we have to compute the input embeddings over codebooks - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - - embed_tokens = model.get_input_embeddings() - - input_ids = input_ids.reshape(-1, config.num_codebooks, input_ids.shape[-1]) - - inputs["inputs_embeds"] = sum( - [embed_tokens[codebook](input_ids[:, codebook]) for codebook in range(config.num_codebooks)] - ) - - with no_grad(): - model(**inputs)[0] - - # override since we have embeddings / LM heads over multiple codebooks - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - first_embed = model.get_input_embeddings()[0] - self.assertIsInstance(first_embed, nn.Embedding) - lm_heads = model.get_output_embeddings() - self.assertTrue(lm_heads is None or isinstance(lm_heads[0], nn.Linear)) - - @unittest.skip(reason="MusicGen melody does not use inputs_embeds") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip(reason="this model doesn't support all arguments tested") - def test_model_outputs_equivalence(self): - pass - - @unittest.skip(reason="this model has multiple inputs embeds and lm heads that should not be tied") - def test_tie_model_weights(self): - pass - - @unittest.skip(reason="this model has multiple inputs embeds and lm heads that should not be tied") - def test_tied_weights_keys(self): - pass - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - _ = inputs_dict.pop("attention_mask", None) - inputs_dict = { - k: v[:batch_size, ...] 
- for k, v in inputs_dict.items() - if "head_mask" not in k and isinstance(v, mindspore.Tensor) - } - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[: batch_size * config.num_codebooks, :] - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - return config, input_ids, attention_mask, inputs_dict - - def _get_logits_processor_kwargs(self, do_sample=False, config=None): - logits_processor_kwargs = {} - return logits_processor_kwargs - - def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask, _ = self._get_input_ids_and_config() - config.audio_channels = 2 - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - inputs_dict={}, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - self.assertNotIn(config.pad_token_id, output_generate) - - -def prepare_musicgen_melody_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - labels=None, -): - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.reshape( - -1, config.decoder.num_codebooks, decoder_input_ids.shape[-1] - )[:, 0, :] - decoder_attention_mask = decoder_attention_mask.ne(config.decoder.pad_token_id) - if head_mask is None: - head_mask = ops.ones( - config.text_encoder.num_hidden_layers, config.text_encoder.num_attention_heads - ) - if decoder_head_mask is None: - decoder_head_mask = ops.ones( - config.decoder.num_hidden_layers, config.decoder.num_attention_heads - ) - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "labels": labels, - } - - -class MusicgenMelodyTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden_layers because of #29297 - seq_length=7, - is_training=True, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=100, - pad_token_id=99, - bos_token_id=99, - num_codebooks=4, - num_filters=4, - codebook_size=128, - conditional_seq_length=3, - chroma_length=24, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.num_codebooks = num_codebooks - self.num_filters = num_filters - self.codebook_size = codebook_size - self.conditional_seq_length = conditional_seq_length - self.chroma_length = chroma_length - self.encoder_seq_length = conditional_seq_length + seq_length - - def 
prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.conditional_seq_length], self.vocab_size) - decoder_input_ids = ids_tensor([self.batch_size * self.num_codebooks, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_musicgen_melody_inputs_dict(config, input_ids, decoder_input_ids=decoder_input_ids) - return config, inputs_dict - - def get_config(self): - text_encoder_config = T5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.intermediate_size, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - ) - audio_encoder_config = EncodecConfig( - hidden_size=self.vocab_size, - compress=1, - num_filters=self.num_filters, - codebook_size=self.codebook_size, - codebook_dim=self.vocab_size, - ) - decoder_config = MusicgenMelodyDecoderConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.bos_token_id, - bos_token_id=self.bos_token_id, - num_codebooks=self.num_codebooks, - tie_word_embeddings=False, - ) - config = MusicgenMelodyConfig.from_sub_models_config( - text_encoder_config, audio_encoder_config, decoder_config, chroma_length=self.chroma_length - ) - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenTest with Musicgen->MusicgenMelody, musicgen->musicgen_melody, EncoderDecoder->DecoderOnly, input_values->input_features -class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MusicgenMelodyForConditionalGeneration,) if is_mindspore_available() else () - greedy_sample_model_classes = (MusicgenMelodyForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = {"text-to-audio": MusicgenMelodyForConditionalGeneration} if is_mindspore_available() else {} - test_pruning = False # training is not supported yet for MusicGen - test_headmasking = False - test_resize_embeddings = False - # not to test torchscript as the model tester doesn't prepare `input_features` and `padding_mask` - # (and `torchscript` hates `None` values). 
- test_torchscript = False - - def setUp(self): - self.model_tester = MusicgenMelodyTester(self) - - # special case for labels - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_codebooks), - dtype=mindspore.int64, - ) - return inputs_dict - - def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - model = model_class(config) - - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - model.train() - - # The audio encoder weights are not used during the forward pass (only during the generate pass) - # So we need to freeze it to be able to train. - model.freeze_audio_encoder() - - optimizer = optim.SGD(model.parameters(), lr=0.01) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() - optimizer.step() - - for k, v in model.named_parameters(): - if v.requires_grad: - self.assertTrue(v.grad is not None, f"{k} in {model_class.__name__} has no gradient!") - - # Ignore copy - def _check_output_with_attentions(self, outputs, config, input_ids, decoder_input_ids): - decoder_config = config.decoder - - decoder_attentions = outputs["attentions"] - num_decoder_layers = decoder_config.num_hidden_layers - self.assertEqual(len(decoder_attentions), num_decoder_layers) - - output_shape = decoder_input_ids.shape[-1] + input_ids.shape[-1] + self.model_tester.chroma_length - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, output_shape, output_shape), - ) - - def check_musicgen_melody_model_output_attentions( - self, - model_class, - config, - input_ids, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - **kwargs, - ): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - **kwargs, - ) - self._check_output_with_attentions(outputs, config, input_ids, decoder_input_ids) - - # Ignore copy - def check_musicgen_melody_model_output_attentions_from_config( - self, - model_class, - config, - input_ids, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - **kwargs, - ): - # Similar to `check_musicgen_melody_model_output_attentions`, but with `output_attentions` triggered from the - # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded - # from the inner models' configurations. 
- config.output_attentions = True # model config -> won't work - - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - **kwargs, - ) - self.assertTrue(all(key not in outputs for key in ["encoder_attentions", "decoder_attentions"])) - config.text_encoder.output_attentions = True # inner model config -> will work - config.audio_encoder.output_attentions = True - config.decoder.output_attentions = True - - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - **kwargs, - ) - self._check_output_with_attentions(outputs, config, input_ids, decoder_input_ids) - - # override since changing `output_attentions` from the top-level model config won't work - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - self.check_musicgen_melody_model_output_attentions(model_class, config, **inputs_dict) - self.check_musicgen_melody_model_output_attentions_from_config(model_class, config, **inputs_dict) - - # override since we have a specific forward signature for musicgen_melody - # Ignore copy - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_ids", - "attention_mask", - "input_features", - "decoder_input_ids", - "decoder_attention_mask", - ] - if "head_mask" and "decoder_head_mask" in arg_names: - expected_arg_names.extend(["head_mask", "decoder_head_mask"]) - - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - # override since changing `gradient_checkpointing` from the top-level model config won't work - def test_gradient_checkpointing_backward_compatibility(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - - config.text_encoder.gradient_checkpointing = True - config.audio_encoder.gradient_checkpointing = True - config.decoder.gradient_checkpointing = True - model = model_class(config) - self.assertTrue(model.is_gradient_checkpointing) - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") - def test_tie_model_weights(self): - pass - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") - def test_tied_model_weights_key_ignore(self): - pass - - @unittest.skip(reason="MusicGen has multiple inputs embeds and lm heads that should not be tied.") - def test_tied_weights_keys(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def 
test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - # override since changing `output_hidden_states` from the top-level model config won't work - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states - - expected_num_layers = self.model_tester.num_hidden_layers + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - # Ignore copy - seq_length = self.model_tester.conditional_seq_length + self.model_tester.chroma_length - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - # Ignore copy - seq_length = self.model_tester.encoder_seq_length + self.model_tester.chroma_length - # Ignore copy - expected_num_layers = self.model_tester.num_hidden_layers + 1 - # Ignore copy - hidden_states = outputs.hidden_states - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.text_encoder.output_hidden_states = True - config.audio_encoder.output_hidden_states = True - config.decoder.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # override since the conv layers and lstm's in encodec are exceptions - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = ["conv"] - ignore_init = ["lstm"] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif not any(x in name for x in ignore_init): - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # override since we have embeddings / LM heads over multiple codebooks - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), nn.Embedding) - lm_heads = model.get_output_embeddings() - self.assertTrue(lm_heads is None or isinstance(lm_heads[0], nn.Linear)) - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - # take max batch_size - sequence_length = input_ids.shape[-1] - input_ids = input_ids[:batch_size, :] - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - - return config, 
input_ids, attention_mask - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen_melody (input / outputs are - # different modalities -> different shapes) - def _greedy_generate( - self, - model, - input_ids, - attention_mask, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - - # override since the `input_ids` cannot be used as the `decoder_input_ids` for musicgen_melody (input / outputs are - # different modalities -> different shapes) - def _sample_generate( - self, - model, - input_ids, - attention_mask, - num_return_sequences, - output_scores=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - ): - mindspore.manual_seed(0) - mindspore.set_seed(0) - model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {} - output_generate = model.generate( - input_ids, - do_sample=True, - num_beams=1, - max_new_tokens=self.max_new_tokens, - num_return_sequences=num_return_sequences, - output_scores=output_scores, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - remove_invalid_values=True, - **model_kwargs, - ) - - return output_generate - - def _get_logits_processor_kwargs(self, do_sample=False, config=None): - logits_processor_kwargs = {} - return logits_processor_kwargs - - def test_greedy_generate_dict_outputs(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - def test_greedy_generate_dict_outputs_use_cache(self): - for model_class in self.greedy_sample_model_classes: - # enable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - - config.use_cache = True - config.is_decoder = True - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - def test_sample_generate(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - model = model_class(config).eval() - - # check `generate()` and `sample()` are equal - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - num_return_sequences=1, - ) - self.assertIsInstance(output_generate, 
mindspore.Tensor) - - def test_sample_generate_dict_output(self): - for model_class in self.greedy_sample_model_classes: - # disable cache - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.use_cache = False - model = model_class(config).eval() - - output_generate = self._sample_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - num_return_sequences=3, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - def test_generate_without_input_ids(self): - config, _, _ = self._get_input_ids_and_config() - - # if no bos token id => cannot generate from None - if config.bos_token_id is None: - self.skipTest(reason="bos_token_id is None") - - for model_class in self.greedy_sample_model_classes: - model = model_class(config) - model.eval() - - output_ids_generate = model.generate( - do_sample=False, max_new_tokens=self.max_new_tokens, remove_invalid_values=True - ) - self.assertIsNotNone(output_ids_generate) - - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - - for model_class in self.greedy_sample_model_classes: - model = model_class(config).eval() - model.half() - # greedy - model.generate(input_dict["input_ids"], attention_mask=input_dict["attention_mask"], max_new_tokens=10) - # sampling - model.generate( - input_dict["input_ids"], attention_mask=input_dict["attention_mask"], do_sample=True, max_new_tokens=10 - ) - - def test_greedy_generate_stereo_outputs(self): - for model_class in self.greedy_sample_model_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - config.audio_channels = 2 - - model = model_class(config).eval() - output_generate = self._greedy_generate( - model=model, - input_ids=input_ids, - attention_mask=attention_mask, - output_scores=True, - output_hidden_states=True, - output_attentions=True, - return_dict_in_generate=True, - ) - - self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) - - self.assertNotIn(config.pad_token_id, output_generate) - - @unittest.skip( - reason="MusicgenMelodyModel is actually not the base of MusicgenMelodyForCausalLM as the latter is a composit model" - ) - def test_save_load_fast_init_from_base(self): - pass - - def test_requires_grad_with_frozen_encoders(self): - config = self.model_tester.get_config() - for model_class in self.all_model_classes: - model = model_class(config) - model.freeze_audio_encoder() - - audio_encoder_grads = [param.requires_grad for param in model.audio_encoder.parameters()] - text_encoder_grads = [param.requires_grad for param in model.text_encoder.parameters()] - - self.assertFalse(all(audio_encoder_grads)) - self.assertTrue(all(text_encoder_grads)) - - model = model_class(config) - model.freeze_text_encoder() - - audio_encoder_grads = [param.requires_grad for param in model.audio_encoder.parameters()] - text_encoder_grads = [param.requires_grad for param in model.text_encoder.parameters()] - - self.assertTrue(all(audio_encoder_grads)) - self.assertFalse(all(text_encoder_grads)) - - -# Copied from tests.models.musicgen.test_modeling_musicgen.get_bip_bip -def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000): - """Produces a series of 'bip bip' sounds at a given frequency.""" - timesteps = np.arange(int(duration * sample_rate)) / sample_rate - wav = np.cos(2 * math.pi * 440 * timesteps) - time_period = (timesteps % (2 * 
bip_duration)) / (2 * bip_duration) - envelope = time_period >= 0.5 - return wav * envelope - - -@require_mindspore -class MusicgenMelodyIntegrationTests(unittest.TestCase): - @cached_property - def model(self): - return MusicgenMelodyForConditionalGeneration.from_pretrained("ylacombe/musicgen-melody") - - @cached_property - def processor(self): - return MusicgenMelodyProcessor.from_pretrained("ylacombe/musicgen-melody") - - @slow - def test_logits_text_prompt(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - # prepare the decoder inputs - pad_token_id = model.generation_config.pad_token_id - decoder_input_ids = ( - ops.ones((input_ids.shape[0] * model.decoder.num_codebooks, 1), dtype=mindspore.int64) - * pad_token_id - ) - - with no_grad(): - logits = model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - ).logits - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor([ - 1.1100, -2.1065, -3.7699, -0.7102, 1.3707, -1.7028, -2.6802, -6.0367, - 1.0504, -2.5358, -4.3497, 0.7338, 0.4823, -2.5260, 1.2717, 1.5427 - ]) - # fmt: on - EXPECTED_OUTPUT_LENGTH = input_ids.shape[1] + 1 + self.model.config.chroma_length - - logits_shape = ( - input_ids.shape[0] * model.decoder.num_codebooks, - EXPECTED_OUTPUT_LENGTH, - model.decoder.config.vocab_size, - ) - - self.assertTrue(logits.shape == logits_shape) - self.assertTrue(ops.allclose(logits[0, -1, :16].cpu(), EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_logits_text_audio_prompt(self): - model = self.model - processor = self.processor - - audio = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)] - text = ["80s music", "Club techno"] - - inputs = processor(audio=audio, text=text, padding=True, return_tensors="ms") - - # prepare the text encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - # prepare the audio encoder inputs - input_features = inputs.input_features - - # prepare the decoder inputs - pad_token_id = model.generation_config.pad_token_id - decoder_input_ids = ( - ops.ones((input_ids.shape[0] * model.decoder.num_codebooks, 1), dtype=mindspore.int64) - * pad_token_id - ) - - with no_grad(): - logits = model( - input_ids, - attention_mask=attention_mask, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - ).logits - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor([ - [ 0.7479, 0.3742, 0.6253, -7.9405, 0.7105, -6.9995, 0.7792, -3.0482], - [-2.7905, 0.7492, -0.2556, -8.1586, -1.6740, 0.5771, -8.3650, -0.0908] - ]) - # fmt: on - - self.assertTrue(logits.shape == (8, 240, 2048)) - self.assertTrue(ops.allclose(logits[1:3, -1, 32:40].cpu(), EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_generate_unconditional_greedy(self): - model = self.model - - # only generate 1 sample with greedy - since it's deterministic all elements of the batch will be the same - unconditional_inputs = self.processor.get_unconditional_inputs(num_samples=1) - - output_values = model.generate(**unconditional_inputs, do_sample=False, max_new_tokens=10, guidance_scale=1.0) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - 1.2741e-04, -8.0466e-05, 5.5789e-04, 1.0402e-03, 2.6547e-04, - 1.5587e-05, -1.4210e-04, -9.7303e-05, 6.4504e-04, 5.0903e-04, - 9.6474e-04, 1.0498e-03, 3.7210e-05, -5.3652e-04, -3.6579e-04, -2.5678e-04 - ] - ) - # fmt: on - - 
self.assertTrue(output_values.shape == (1, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_unconditional_sampling(self): - model = self.model - - # for stochastic sampling we can generate multiple outputs - unconditional_inputs = self.processor.get_unconditional_inputs(num_samples=2) - - set_seed(0) - - output_values = model.generate( - **unconditional_inputs, do_sample=True, max_new_tokens=10, guidance_scale=1.0, temperature=1.0, top_k=250 - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -0.0085, -0.0160, 0.0028, 0.0005, -0.0095, 0.0028, -0.0122, -0.0299, - -0.0052, -0.0145, 0.0092, 0.0063, -0.0378, -0.0621, -0.0784, -0.0120, - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_prompt_greedy(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - output_values = model.generate( - input_ids, attention_mask=attention_mask, do_sample=False, guidance_scale=None, max_new_tokens=10 - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - 1.2741e-04, -8.0474e-05, 5.5789e-04, 1.0402e-03, 2.6547e-04, - 1.5597e-05, -1.4210e-04, -9.7309e-05, 6.4504e-04, 5.0903e-04 - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :10].cpu(), EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_prompt_greedy_with_classifier_free_guidance(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - output_values = model.generate( - input_ids, attention_mask=attention_mask, do_sample=False, guidance_scale=3, max_new_tokens=10 - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - 1.2741e-04, -8.0474e-05, 5.5789e-04, 1.0402e-03, 2.6547e-04, - 1.5597e-05, -1.4210e-04, -9.7309e-05, 6.4504e-04, 5.0903e-04, - 9.6475e-04, 1.0499e-03, 3.7215e-05, -5.3651e-04, -3.6578e-04, -2.5678e-04 - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_prompt_sampling(self): - model = self.model - processor = self.processor - - inputs = processor(text=["80s music", "Club techno"], padding=True, return_tensors="ms") - - # prepare the encoder inputs - input_ids = inputs.input_ids - attention_mask = inputs.attention_mask - - set_seed(0) - output_values = model.generate( - input_ids, - attention_mask=attention_mask, - do_sample=True, - guidance_scale=None, - max_new_tokens=10, - temperature=1.0, - top_k=250, - ) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -0.0165, -0.0222, -0.0041, -0.0058, -0.0145, -0.0023, -0.0160, -0.0310, - -0.0055, -0.0127, 0.0104, 0.0105, -0.0326, -0.0611, -0.0744, -0.0083 - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, atol=1e-4)) - - @slow - def test_generate_text_audio_prompt(self): - model = self.model - 
processor = self.processor - - audio = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)] - text = ["80s music", "Club techno"] - - inputs = processor(audio=audio, text=text, padding=True, return_tensors="ms") - - output_values = model.generate(**inputs, do_sample=False, guidance_scale=None, max_new_tokens=10) - - # fmt: off - EXPECTED_VALUES = mindspore.tensor( - [ - -1.1999e-04, -2.2303e-04, 4.6296e-04, 1.0524e-03, 2.4827e-04, - -4.0294e-05, -1.2468e-04, 4.9846e-05, 7.1484e-04, 4.4198e-04, - 7.9063e-04, 8.8141e-04, -6.1807e-05, -6.1856e-04, -3.6235e-04, -2.7226e-04 - ] - ) - # fmt: on - - self.assertTrue(output_values.shape == (2, 1, 4480)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES, atol=1e-4)) - - -@require_mindspore -class MusicgenMelodyStereoIntegrationTests(unittest.TestCase): - @cached_property - def model(self): - return MusicgenMelodyForConditionalGeneration.from_pretrained("ylacombe/musicgen-stereo-melody") - - @cached_property - def processor(self): - return MusicgenMelodyProcessor.from_pretrained("ylacombe/musicgen-stereo-melody") - - @slow - def test_generate_unconditional_greedy(self): - model = self.model - - # only generate 1 sample with greedy - since it's deterministic all elements of the batch will be the same - unconditional_inputs = self.processor.get_unconditional_inputs(num_samples=1) - - output_values = model.generate(**unconditional_inputs, do_sample=False, max_new_tokens=12, guidance_scale=1.0) - - # fmt: off - EXPECTED_VALUES_LEFT = mindspore.tensor( - [ - 1.2742e-04, -8.0480e-05, 5.5788e-04, 1.0401e-03, 2.6547e-04, - 1.5587e-05, -1.4211e-04, -9.7308e-05, 6.4503e-04, 5.0903e-04, - 9.6475e-04, 1.0499e-03, 3.7205e-05, -5.3652e-04, -3.6579e-04, 2.5679e-04 - ] - ) - # fmt: on - - # (bsz, channels, seq_len) - self.assertTrue(output_values.shape == (1, 2, 5760)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES_LEFT, atol=6e-4)) - self.assertTrue(ops.allclose(output_values[0, 1, :16].cpu(), EXPECTED_VALUES_LEFT, atol=6e-4)) - - @slow - def test_generate_text_audio_prompt(self): - model = self.model - processor = self.processor - - audio = [get_bip_bip(duration=0.5), get_bip_bip(duration=1.0)] - text = ["80s music", "Club techno"] - - inputs = processor(audio=audio, text=text, padding=True, return_tensors="ms") - - output_values = model.generate(**inputs, do_sample=False, guidance_scale=3.0, max_new_tokens=12) - - # fmt: off - EXPECTED_VALUES_LEFT_FIRST_SAMPLE = mindspore.tensor( - [ - -0.0862, -0.1021, -0.0936, -0.0754, -0.0616, -0.0456, -0.0354, -0.0298, - -0.0036, 0.0222, 0.0523, 0.0660, 0.0496, 0.0356, 0.0457, 0.0769 - ] - ) - EXPECTED_VALUES_RIGHT_SECOND_SAMPLE = mindspore.tensor( - [ - -0.0327, -0.0450, -0.0264, -0.0278, -0.0365, -0.0272, -0.0401, -0.0574, - -0.0413, -0.0508, -0.0269, -0.0323, -0.0762, -0.1115, -0.1390, -0.0790 - ] - ) - # fmt: on - - # (bsz, channels, seq_len) - self.assertTrue(output_values.shape == (2, 2, 5760)) - self.assertTrue(ops.allclose(output_values[0, 0, :16].cpu(), EXPECTED_VALUES_LEFT_FIRST_SAMPLE, atol=1e-4)) - self.assertTrue(ops.allclose(output_values[1, 1, :16].cpu(), EXPECTED_VALUES_RIGHT_SECOND_SAMPLE, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/musicgen_melody/test_processor_musicgen_melody.py b/tests/transformers/models/musicgen_melody/test_processor_musicgen_melody.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mvp/__init__.py b/tests/transformers/models/mvp/__init__.py 
deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/mvp/test_modeling_mvp.py b/tests/transformers/models/mvp/test_modeling_mvp.py deleted file mode 100644 index 62e04664b..000000000 --- a/tests/transformers/models/mvp/test_modeling_mvp.py +++ /dev/null @@ -1,812 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore MVP model.""" - -import copy -import tempfile -import unittest - -from mindnlp.transformers import MvpConfig, is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - MvpForCausalLM, - MvpForConditionalGeneration, - MvpForQuestionAnswering, - MvpForSequenceClassification, - MvpModel, - MvpTokenizer, - ) - from mindnlp.transformers.models.mvp.modeling_mvp import MvpDecoder, MvpEncoder, shift_tokens_right - - -def prepare_mvp_inputs_dict( - config, - input_ids, - decoder_input_ids=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class MvpModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = 
hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_mvp_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return MvpConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.max_position_embeddings = 100 - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = MvpModel(config=config).get_decoder().eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(attention_mask.dtype)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = MvpModel(config=config).eval() - outputs = 
model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = MvpEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = MvpDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class MvpHeadTests(unittest.TestCase): - vocab_size = 99 - - def _get_config_and_data(self): - input_ids = mindspore.tensor( - [ - [71, 82, 18, 33, 46, 91, 2], - [68, 34, 26, 58, 30, 82, 2], - [5, 97, 17, 39, 94, 40, 2], - [76, 83, 94, 25, 70, 78, 2], - [87, 59, 41, 35, 48, 66, 2], - [55, 13, 16, 58, 5, 2, 1], # note padding - [64, 27, 31, 51, 12, 75, 2], - [52, 64, 86, 17, 83, 39, 2], - [48, 61, 9, 24, 71, 82, 2], - [26, 1, 60, 48, 22, 13, 2], - [21, 5, 62, 28, 14, 76, 2], - [45, 98, 37, 86, 59, 48, 2], - [70, 70, 50, 9, 28, 0, 2], - ], - dtype=mindspore.int64, - ) - - batch_size = input_ids.shape[0] - config = MvpConfig( - vocab_size=self.vocab_size, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - return config, input_ids, batch_size - - def test_sequence_classification_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - labels = _long_tensor([2] * batch_size) - config.num_labels = 3 - model = MvpForSequenceClassification(config) - outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels) - expected_shape = (batch_size, config.num_labels) - self.assertEqual(outputs["logits"].shape, expected_shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_question_answering_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - sequence_labels = ids_tensor([batch_size], 2) - model = MvpForQuestionAnswering(config) - outputs = model( - input_ids=input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - - self.assertEqual(outputs["start_logits"].shape, input_ids.shape) - self.assertEqual(outputs["end_logits"].shape, input_ids.shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_lm_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size) - lm_model = MvpForConditionalGeneration(config) - outputs = lm_model(input_ids=input_ids, labels=lm_labels) - expected_shape = (batch_size, input_ids.shape[1], config.vocab_size) - self.assertEqual(outputs["logits"].shape, expected_shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_lm_uneven_forward(self): - config = MvpConfig( - 
vocab_size=self.vocab_size, - d_model=14, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=8, - decoder_ffn_dim=8, - max_position_embeddings=48, - ) - lm_model = MvpForConditionalGeneration(config) - context = mindspore.tensor( - [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=mindspore.int64 - ) - summary = mindspore.tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=mindspore.int64) - outputs = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) - expected_shape = (*summary.shape, config.vocab_size) - self.assertEqual(outputs["logits"].shape, expected_shape) - - def test_generate_beam_search(self): - input_ids = mindspore.tensor([[71, 82, 2], [68, 34, 2]], dtype=mindspore.int64) - config = MvpConfig( - vocab_size=self.vocab_size, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - lm_model = MvpForConditionalGeneration(config) - lm_model.eval() - - max_length = 5 - generated_ids = lm_model.generate( - input_ids, - do_sample=True, - num_return_sequences=1, - num_beams=2, - no_repeat_ngram_size=3, - max_length=max_length, - ) - self.assertEqual(generated_ids.shape, (input_ids.shape[0], max_length)) - - def test_shift_tokens_right(self): - input_ids = mindspore.tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]], dtype=mindspore.int64) - shifted = shift_tokens_right(input_ids, 1, 2) - n_pad_before = input_ids.eq(1).float().sum() - n_pad_after = shifted.eq(1).float().sum() - self.assertEqual(shifted.shape, input_ids.shape) - self.assertEqual(n_pad_after, n_pad_before - 1) - self.assertTrue(ops.eq(shifted[:, 0], 2).all()) - - @slow - def test_tokenization(self): - tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp") - examples = [" Hello world", " DomDramg"] # need leading spaces for equality - fairseq_results = [ - mindspore.tensor([0, 20920, 232, 2]), - mindspore.tensor([0, 11349, 495, 4040, 571, 2]), - ] - for ex, desired_result in zip(examples, fairseq_results): - mvp_toks = tokenizer.encode(ex, return_tensors="ms").squeeze() - assert_tensors_close(desired_result.long(), mvp_toks, prefix=ex) - - def test_generate_fp16(self): - config, input_ids, batch_size = self._get_config_and_data() - attention_mask = input_ids.ne(1) - model = MvpForConditionalGeneration(config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - def test_dummy_inputs(self): - config, *_ = self._get_config_and_data() - model = MvpForConditionalGeneration(config).eval() - model(**model.dummy_inputs) - - def test_resize_tokens_embeddings_more(self): - config, input_ids, _ = self._get_config_and_data() - - def _get_embs(m): - return (m.get_input_embeddings().weight, m.get_output_embeddings().weight) - - model = MvpForConditionalGeneration(config).eval() - input, output = _get_embs(model) - self.assertTrue(ops.eq(input, output).all()) - new_vocab_size = 45 - model.resize_token_embeddings(new_vocab_size) - input_new, output_new = _get_embs(model) - self.assertEqual(input_new.shape, (new_vocab_size, config.d_model)) - self.assertEqual(output_new.shape, (new_vocab_size, config.d_model)) - self.assertTrue(ops.eq(input_new, output_new).all()) - - -@require_mindspore -class 
MvpModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (MvpModel, MvpForConditionalGeneration, MvpForSequenceClassification, MvpForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (MvpForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": MvpModel, - "fill-mask": MvpForConditionalGeneration, - "question-answering": MvpForQuestionAnswering, - "summarization": MvpForConditionalGeneration, - "text-classification": MvpForSequenceClassification, - "text-generation": MvpForCausalLM, - "text2text-generation": MvpForConditionalGeneration, - "translation": MvpForConditionalGeneration, - "zero-shot": MvpForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = False - test_pruning = False - test_missing_keys = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizer are used. - # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - def setUp(self): - self.model_tester = MvpModelTester(self) - self.config_tester = ConfigTester(self, config_class=MvpConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - # MvpForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (MvpModel, MvpForConditionalGeneration, MvpForQuestionAnswering): - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - def test_generate_fp16(self): - config, 
input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = MvpForConditionalGeneration(config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if ops.allclose(a, b, atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class MvpModelIntegrationTests(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return MvpTokenizer.from_pretrained("RUCAIBox/mvp") - - @slow - def test_inference_no_head(self): - model = MvpModel.from_pretrained("RUCAIBox/mvp") - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - attention_mask = input_ids.ne(model.config.pad_token_id) - with no_grad(): - output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state - expected_shape = (1, 11, 1024) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [[0.3461, 0.3624, 0.2689], [0.3461, 0.3624, 0.2689], [-0.1562, 1.1637, -0.3784]] - ) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) - - @slow - def test_summarization_inference(self): - model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp") - tok = self.default_tokenizer - PGE_ARTICLE = """ Listen to local radio broadcasts for advertisements that reference casinos in your area.\nIf none are in your area, listen to national radio broadcasts for advertisements of casinos in other areas.\nNote the location that is mentioned in each advertisement that involves a casino.\nIf no locations are mentioned, note any additional contact information, such as a website or phone number. Use that information to find out where the casinos are.;\n,\n\nIf you learn about more than 1 casino on the radio, use the Internet to search the distance between your location and each casino. Sites such as maps.google.com or mapquest.com will help you in this search.'""" # fmt: skip - EXPECTED_SUMMARY = "Listen to the radio.\nUse the Internet." 
- dct = tok.batch_encode_plus( - [PGE_ARTICLE], - return_tensors="ms", - ) - - hypotheses_batch = model.generate(**dct) - - decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True) - self.assertEqual(EXPECTED_SUMMARY, decoded[0]) - - -class MvpStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = MvpConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - encoder_layers=self.decoder_layers, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.decoder_seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - return ( - config, - input_ids, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = MvpDecoder(config=config).eval() 
- # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = MvpDecoder(config=config).eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class MvpStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (MvpDecoder, MvpForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (MvpForCausalLM,) if is_mindspore_available() else () - fx_comptatible = True - test_pruning = False - 
is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = MvpStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=MvpConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - @unittest.skip(reason="Decoder cannot keep gradients") - def test_retain_grad_hidden_states_attentions(self): - return \ No newline at end of file diff --git a/tests/transformers/models/nllb/__init__.py b/tests/transformers/models/nllb/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/nllb/test_tokenization_nllb.py b/tests/transformers/models/nllb/test_tokenization_nllb.py deleted file mode 100644 index e96626bde..000000000 --- a/tests/transformers/models/nllb/test_tokenization_nllb.py +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -NLLB_test_tokenization -""" -import shutil -import tempfile -import unittest - -from mindnlp.transformers import ( - SPIECE_UNDERLINE, -) -from mindnlp.transformers.tokenization_utils_base import AddedToken,BatchEncoding - -from mindnlp.transformers.models.nllb.tokenization_nllb import FAIRSEQ_LANGUAGE_CODES,NllbTokenizer -from mindnlp.transformers.models.nllb.tokenization_nllb_fast import NllbTokenizerFast -from mindnlp.utils.testing_utils import ( - get_tests_dir, - nested_simplify, - require_sentencepiece, - require_tokenizers, - require_mindspore, - is_mindspore_available, -) - -from ...test_tokenization_common import TokenizerTesterMixin - - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - - -if is_mindspore_available(): - from mindnlp.transformers.models.bart.modeling_bart import shift_tokens_right - -EN_CODE = 256047 -RO_CODE = 256145 - - -@require_sentencepiece -@require_tokenizers -class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "facebook/nllb-200-distilled-600M" - tokenizer_class = NllbTokenizer - rust_tokenizer_class = NllbTokenizerFast - test_rust_tokenizer = True - test_sentencepiece = True - from_pretrained_kwargs = {} - - def setUp(self): - super().setUp() - - # We have a SentencePiece fixture for testing - tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) - - def test_full_tokenizer(self): - tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), - [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], - ) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual( - tokens, - [ - SPIECE_UNDERLINE + "I", - SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", - "or", - "n", - SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", - "9", - "2", - "0", - "0", - "0", - ",", - SPIECE_UNDERLINE + "and", - SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", - "al", - "s", - "é", - ".", - ], - ) - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual( - ids, - [ - value + tokenizer.fairseq_offset - for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] - ], - ) - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual( - back_tokens, - [ - SPIECE_UNDERLINE + "I", - SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", - "or", - "n", - SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", - "", - "2", - "0", - "0", - "0", - ",", - SPIECE_UNDERLINE + "and", - SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", - "al", - "s", - "", - ".", - ], - ) - - # overwrite from test_tokenization_common to speed up test - def test_save_pretrained(self): - self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {}) - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) 
- - # Checks it save with the same files + the tokenizer.json file for the fast one - self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) - tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) - self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - # Save tokenizer rust, legacy_format=True - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it save with the same files - self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - # Save tokenizer rust, legacy_format=False - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it saved the tokenizer.json file - self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - @require_mindspore - def test_prepare_seq2seq_batch(self): - if not self.test_seq2seq: - self.skipTest(reason="test_seq2seq is set to False") - - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Longer text that will definitely require truncation. 
- src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" - " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" - " will only worsen the violence and misery for millions of people.", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al" - ' Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi' - " că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", - ] - try: - batch = tokenizer.prepare_seq2seq_batch( - src_texts=src_text, - tgt_texts=tgt_text, - max_length=3, - max_target_length=10, - return_tensors="ms", - src_lang="eng_Latn", - tgt_lang="ron_Latn", - ) - except NotImplementedError: - self.skipTest(reason="Encountered NotImplementedError when calling prepare_seq2seq_batch") - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.labels.shape[1], 10) - # max_target_length will default to max_length if not specified - batch = tokenizer.prepare_seq2seq_batch( - src_text, tgt_texts=tgt_text, max_length=3, return_tensors="ms" - ) - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.labels.shape[1], 3) - - batch_encoder_only = tokenizer.prepare_seq2seq_batch( - src_texts=src_text, max_length=3, max_target_length=10, return_tensors="ms" - ) - self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) - self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) - self.assertNotIn("decoder_input_ids", batch_encoder_only) - - @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") - def test_save_slow_from_fast_and_reload_fast(self): - pass - - def test_special_tokens_initialization(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("<special>", lstrip=True)] - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs - ) - r_output = tokenizer_r.encode("Hey this is a <special> token") - - special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0] - - self.assertTrue(special_token_id in r_output) - - if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( - pretrained_name, - additional_special_tokens=added_tokens, - **kwargs, # , from_slow=True <- unfortunately too slow to convert - ) - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs - ) - - p_output = tokenizer_p.encode("Hey this is a <special> token") - - cr_output = tokenizer_cr.encode("Hey this is a <special> token") - - self.assertEqual(p_output, r_output) - self.assertEqual(cr_output, r_output) - self.assertTrue(special_token_id in p_output) - self.assertTrue(special_token_id in cr_output) - - @unittest.skip(reason="Need to fix this after #26538") - def test_training_new_tokenizer(self): - pass - - def test_new_language_codes(self): - code1, code2 = "myv_Cyrl", "myv_Latn" - new_codes = FAIRSEQ_LANGUAGE_CODES + [code1, code2] - # here I create a tokenizer with the default behaviour - tok1 = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M") - # here I enhance the model's vocabulary with two new language codes - 
tok2 = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", additional_special_tokens=new_codes) - - # testing that the new codes can work - self.assertEqual(len(tok2), len(tok1) + 2) - tok2.tgt_lang = code1 - tok2.src_lang = code2 - - self.assertEqual(tok2("šumbrat!").input_ids[0], tok2.convert_tokens_to_ids(code2)) - with tempfile.TemporaryDirectory() as tempdir: - # testing that saving and loading the tokenizer preserves the new behaviour - tok2.save_pretrained(tempdir) - tok3 = NllbTokenizer.from_pretrained(tempdir) - self.assertEqual(tok2.get_vocab(), tok3.get_vocab()) - tok3.src_lang = code2 - self.assertEqual(tok3("šumbrat!").input_ids[0], tok3.convert_tokens_to_ids(code2)) - - # testing that saving and loading the tokenizer preserves the new behaviour - tok2.save_pretrained(tempdir) - tok3 = NllbTokenizer(f"{tempdir}/sentencepiece.bpe.model", additional_special_tokens=None) - self.assertEqual(len(tok3), 256204) # legacy - tok4 = NllbTokenizer(f"{tempdir}/sentencepiece.bpe.model", additional_special_tokens=[]) - self.assertEqual(len(tok4), 256002) - tok5 = NllbTokenizer(f"{tempdir}/sentencepiece.bpe.model", additional_special_tokens=[code1, code2]) - self.assertEqual(len(tok5), 256004) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class NllbDistilledIntegrationTest(unittest.TestCase): - checkpoint_name = "facebook/nllb-200-distilled-600M" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that - "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" - ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' - " face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", - ] - expected_src_tokens = [ - 256047, - 16297, - 134408, - 8165, - 248066, - 14734, - 950, - 1135, - 105721, - 3573, - 83, - 27352, - 108, - 49486, - 2, - ] - - @classmethod - def setUpClass(cls): - cls.tokenizer: NllbTokenizer = NllbTokenizer.from_pretrained( - cls.checkpoint_name, src_lang="eng_Latn", tgt_lang="ron_Latn" - ) - cls.pad_token_id = 1 - return cls - - def test_enro_tokenizer_batch_encode_plus(self): - ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] - self.assertListEqual(self.expected_src_tokens, ids) - - def test_enro_tokenizer_decode_ignores_language_codes(self): - self.assertIn(RO_CODE, self.tokenizer.all_special_ids) - generated_ids = [RO_CODE, 4254, 98068, 112923, 39072, 3909, 713, 102767, 26, 17314, 35642, 14683, 33118, 2022, 66987, 2, 256047] # fmt: skip - - result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) - self.assertEqual(result, expected_romanian) - self.assertNotIn(self.tokenizer.eos_token, result) - - def test_enro_tokenizer_truncation(self): - src_text = ["this is gunna be a long sentence " * 20] - assert isinstance(src_text[0], str) - desired_max_length = 10 - ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] - self.assertEqual(ids[-1], 2) - self.assertEqual(ids[0], EN_CODE) - 
self.assertEqual(len(ids), desired_max_length) - - def test_mask_token(self): - self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [256203, 3]) - - @require_mindspore - def test_enro_tokenizer_prepare_batch(self): - batch = self.tokenizer( - self.src_text, - text_target=self.tgt_text, - padding=True, - truncation=True, - max_length=len(self.expected_src_tokens), - return_tensors="ms", - ) - batch["decoder_input_ids"] = shift_tokens_right( - batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.convert_tokens_to_ids("ron_Latn") - ) - - self.assertIsInstance(batch, BatchEncoding) - - self.assertEqual((2, 15), batch.input_ids.shape) - self.assertEqual((2, 15), batch.attention_mask.shape) - result = batch.input_ids.tolist()[0] - self.assertListEqual(self.expected_src_tokens, result) - self.assertEqual(RO_CODE, batch.decoder_input_ids[0, 0]) # EOS - # Test that special tokens are reset - self.assertEqual(self.tokenizer.prefix_tokens, [EN_CODE]) - self.assertEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id]) - - def test_seq2seq_max_length(self): - batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="ms") - targets = self.tokenizer( - text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="ms" - ) - labels = targets["input_ids"] - batch["decoder_input_ids"] = shift_tokens_right( - labels, - self.tokenizer.pad_token_id, - decoder_start_token_id=self.tokenizer.convert_tokens_to_ids(self.tokenizer.tgt_lang), - ) - - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.decoder_input_ids.shape[1], 10) - - @require_mindspore - def test_tokenizer_translation(self): - inputs = self.tokenizer._build_translation_inputs( - "A test", return_tensors="ms", src_lang="eng_Latn", tgt_lang="fra_Latn" - ) - - self.assertEqual( - nested_simplify(inputs), - { - # A, test, EOS, en_XX - "input_ids": [[256047, 70, 7356, 2]], - "attention_mask": [[1, 1, 1, 1]], - # ar_AR - "forced_bos_token_id": 256057, - }, - ) - - @require_mindspore - def test_legacy_behaviour(self): - self.tokenizer.legacy_behaviour = True - inputs = self.tokenizer( - "UN Chief says there is no military solution in Syria", src_lang="eng_Latn", tgt_lang="fra_Latn" - ) - self.assertEqual( - inputs.input_ids, [16297, 134408, 25653, 6370, 248, 254, 103929, 94995, 108, 49486, 2, 256047] - ) - - self.tokenizer.legacy_behaviour = False - inputs = self.tokenizer( - "UN Chief says there is no military solution in Syria", src_lang="eng_Latn", tgt_lang="fra_Latn" - ) - self.assertEqual( - inputs.input_ids, [256047, 16297, 134408, 25653, 6370, 248, 254, 103929, 94995, 108, 49486, 2] - ) diff --git a/tests/transformers/models/nllb_moe/__init__.py b/tests/transformers/models/nllb_moe/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/nllb_moe/test_modeling_nllb_moe.py b/tests/transformers/models/nllb_moe/test_modeling_nllb_moe.py deleted file mode 100644 index dd55d0ae3..000000000 --- a/tests/transformers/models/nllb_moe/test_modeling_nllb_moe.py +++ /dev/null @@ -1,674 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore NLLB-MoE model.""" - -import copy -import tempfile -import unittest - -from mindnlp.transformers import NllbMoeConfig -from mindnlp.engine import set_seed -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, - is_mindspore_available, -) -from mindnlp.utils import cached_property -from mindnlp.core import no_grad - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - NllbMoeForConditionalGeneration, - NllbMoeModel, - NllbTokenizer, - ) - from mindnlp.transformers.models.nllb_moe.modeling_nllb_moe import ( - NllbMoeDecoder, - NllbMoeEncoder, - NllbMoeTop2Router, - ) - - -class NllbMoeModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - num_experts=4, - encoder_sparse_step=2, - decoder_sparse_step=1, - expert_capacity=100, - router_jitter_noise=0.0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.encoder_sparse_step = encoder_sparse_step - self.decoder_sparse_step = decoder_sparse_step - self.expert_capacity = expert_capacity - self.router_jitter_noise = router_jitter_noise - self.num_experts = num_experts - - def prepare_nllb_moe_inputs_dict( - self, - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - ): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones( - 
config.decoder_layers, config.decoder_attention_heads - ) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones( - config.decoder_layers, config.decoder_attention_heads - ) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids[:, -1] = self.eos_token_id # Eos Token - decoder_input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size - ) - - # we need to clamp the input ids here to avoid having pad token in between - # this is because for NllbMoe the position_ids are prepared such that - # all pad tokens have pos id = 2 and rest are between 2..seq_length - # and the seq_length here is seq_length - num_pad_tokens - # but when using past, there is no way of knowing if the past input ids had - # pad tokens in them, which results in incorrect seq_lenth and which in turn results in - # position_ids being off by num_pad_tokens in past input - input_ids = input_ids.clamp(self.pad_token_id + 1) - decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1) - - config = self.get_config() - inputs_dict = self.prepare_nllb_moe_inputs_dict( - config, input_ids, decoder_input_ids - ) - return config, inputs_dict - - def get_config(self): - return NllbMoeConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - encoder_layerdrop=self.encoder_layerdrop, - decoder_layerdrop=self.decoder_layerdrop, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - expert_capacity=self.expert_capacity, - router_jitter_noise=self.router_jitter_noise, - decoder_sparse_step=self.decoder_sparse_step, - encoder_sparse_step=self.encoder_sparse_step, - num_experts=self.num_experts, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - @require_mindspore - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = NllbMoeModel(config=config).get_decoder().eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model( - input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=True, - ) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask.to(mindspore.int64), next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[ - 
"last_hidden_state" - ] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - )["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[ - :, -3:, random_slice_idx - ] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue( - ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - ) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = NllbMoeModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = NllbMoeEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder( - inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"] - )[0] - - self.parent.assertTrue( - (encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() - < 1e-3 - ) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = NllbMoeDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue( - (last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3 - ) - - -@require_mindspore -class NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (NllbMoeModel, NllbMoeForConditionalGeneration) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (NllbMoeForConditionalGeneration,) if is_mindspore_available() else () - ) - is_encoder_decoder = True - fx_compatible = False - test_pruning = False - test_missing_keys = True - test_torchscript = False - - def setUp(self): - self.model_tester = NllbMoeModelTester(self) - self.config_tester = ConfigTester(self, config_class=NllbMoeConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained( - tmpdirname, output_loading_info=True - ) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - config.decoder_sparse_step = 0 - self.model_tester.create_and_check_decoder_model_past_large_inputs( - config, inputs_dict - ) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in 
(NllbMoeModel, NllbMoeForConditionalGeneration): - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - # @require_torch_fp16 - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = NllbMoeForConditionalGeneration(config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate( - num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3 - ) - - def test_get_loss(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_dict["output_router_logits"] = True - input_dict["labels"] = input_dict["input_ids"] - model = NllbMoeForConditionalGeneration(config).eval() - out = model(**input_dict) - self.assertIsNotNone(out.loss) - self.assertIsNotNone(model(**input_dict)["encoder_router_logits"][1]) - self.assertIsNotNone(model(**input_dict)["decoder_router_logits"][0]) - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@slow -class NllbMoeModelIntegrationTests(unittest.TestCase): - @require_mindspore - @cached_property - def model_inputs(self): - return { - "input_ids": mindspore.Tensor( - [ - [ - 28768, - 248, - 6399, - 9, - 65972, - 452, - 1925, - 629, - 123543, - 248075, - 2, - 256047, - ], - [117, 7027, 7195, 202, 44778, 248075, 2, 256047, 1, 1, 1, 1], - ] - ), - "attention_mask": mindspore.Tensor( - [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], - ] - ), - "decoder_input_ids": mindspore.Tensor([[2, 256057], [2, 256057]]), - } - - @cached_property - def tokenizer(self): - return NllbTokenizer.from_pretrained( - "hf-internal-testing/random-nllb-moe-2-experts" - ) - - @cached_property - def big_model(self): - return NllbMoeForConditionalGeneration.from_pretrained( - "facebook/nllb-moe-54b" # 240G - ) - - def inference_no_head(self): - model = NllbMoeModel.from_pretrained( - "hf-internal-testing/random-nllb-moe-2-experts" - ).eval() - with no_grad(): - output = model(**self.model_inputs) - # fmt: off - EXPECTED_ENCODER_STATE = mindspore.Tensor([ 0.3920, -0.1974, -0.0279, 0.3463, -0.8306, -1.0629, -0.4643, 2.0563, 1.1123, 0.3566, -0.9291, -0.3840, -0.2527, -0.9858, 1.5185, -1.1346, 0.0323, -0.9103, -0.3647, -0.4462, -0.9720, -0.3541, 0.1777, -0.4647, 1.6970, -0.9062, 0.2727, -1.0737, 0.8785, 0.4324]) - EXPECTED_DECODER_STATE = mindspore.Tensor([-6.0425e-02, -2.0015e-01, 6.0575e-02, -8.6366e-01, -1.1310e+00, 6.8369e-01, 7.5615e-01, 7.3555e-01, 2.3071e-01, 1.5954e+00, -7.0728e-01, -2.2647e-01, -1.3292e+00, 4.8246e-01, -6.9153e-01, -1.8199e-02, -7.3664e-01, 1.5902e-03, 
1.0760e-01, 1.0298e-01, -9.3933e-01, -4.6567e-01, 8.0417e-01, 1.5243e+00, 5.5844e-01, -9.9239e-02, 1.4885e+00, 7.1527e-02, -5.2612e-01, 9.4435e-02]) - # fmt: on - self.assertTrue(ops.allclose(output.encoder_last_hidden_state[1, 0, :30], EXPECTED_ENCODER_STATE, rtol=6e-3, atol=9e-3)) - self.assertTrue(ops.allclose(output.last_hidden_state[1, 0, :30], EXPECTED_DECODER_STATE, rtol=6e-3, atol=9e-3)) - - def test_inference_logits(self): - r""" - Logits testing to check implementation consistency between `fairseq` implementation - and `transformers` implementation of NLLB-MoE transformers. We only check the logits - of the second sample of the batch, as it is padded. - """ - model = NllbMoeForConditionalGeneration.from_pretrained( - "hf-internal-testing/random-nllb-moe-2-experts" - ).eval() - with no_grad(): - output = model(**self.model_inputs) - - EXPECTED_LOGTIS = mindspore.Tensor([-0.3059, 0.0000, 9.3029, 0.6456, -0.9148, 1.7836, 0.6478, 0.9438, -0.5272, -0.6617, -1.2717, 0.4564, 0.1345, -0.2301, -1.0140, 1.1427, -1.5535, 0.1337, 0.2082, -0.8112, -0.3842, -0.3377, 0.1256, 0.6450, -0.0452, 0.0219, 1.4274, -0.4991, -0.2063, -0.4409,]) # fmt: skip - - self.assertTrue(ops.allclose(output.logits[1, 0, :30], EXPECTED_LOGTIS, rtol=6e-3, atol=9e-3)) - - @unittest.skip(reason="This requires 300GB of RAM") - def test_large_logits(self): - model = self.big_model - with no_grad(): - output = model(**self.model_inputs) - - # fmt: off - EXPECTED_ENCODER_STATE = mindspore.Tensor([ 0.1696, -0.0059, 0.0489, 0.0479, -0.4222, -0.2178, -0.1372, -0.0860, -0.4249, -0.0081, -0.1186, 0.6678, 0.0160, 0.4140, 0.1799, 0.0672, -0.4941, 0.0173, -0.0740, 0.0845, -0.2197, 0.4465, 0.2268, -0.1752, -0.0562, 0.1033, -0.0869, -0.5490, 0.0582, 0.2165]) - EXPECTED_DECODER_STATE = mindspore.Tensor([ 0.0374, -0.1055, -0.1060, -0.1711, -0.0540, -0.1183, -0.0779, 0.0610, -0.0279, -0.0848, 0.0222, 0.0372, -0.0298, -0.0861, -0.0354, -0.0103, 0.0538, -0.0148, -0.0105, 0.0224, 0.0629, -0.0291, -0.0671, 0.0173, -0.0066, -0.0245, -0.0499, 0.0760, -0.0067, 0.0086]) - EXPECTED_LOGTIS = mindspore.Tensor([ 0.3834, 0.2057, 4.5399, 0.8301, 0.4810, 0.9325, 0.9928, 0.9574, 0.5517, 0.9156, 0.2698, 0.6728, 0.7121, 0.3080, 0.4693, 0.5756, 1.0407, 0.2219, 0.3714, 0.5699, 0.5547, 0.8472, 0.3178, 0.1286, 0.1791, 0.9391, 0.5153, -0.2146, 0.1689, 0.6816]) - # fmt: on - - self.assertTrue(ops.allclose(output.encoder_last_hidden_state[1, 0, :30], EXPECTED_ENCODER_STATE, rtol=6e-3, atol=9e-3)) - self.assertTrue(ops.allclose(output.last_hidden_state[1, 0, :30], EXPECTED_DECODER_STATE, rtol=6e-3, atol=9e-3)) - self.assertTrue(ops.allclose(output.logits[1, 0, :30], EXPECTED_LOGTIS, rtol=6e-3, atol=9e-3)) - - @unittest.skip(reason="This requires 300GB of RAM") - def test_seq_to_seq_generation(self): - model = self.big_model - tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-moe-54b") - - # first 6 samples of load_dataset("facebook/flores", "eng_Latn-fra_Latn"), devtest. Truth are very similar to the fairseq translation files - FIRST_6_FLORES_200 = [ - 'We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.', - "Dr. 
Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.", - "Like some other experts, he is skeptical about whether diabetes can be cured, noting that these findings have no relevance to people who already have Type 1 diabetes.", - "On Monday, Sara Danius, permanent secretary of the Nobel Committee for Literature at the Swedish Academy, publicly announced during a radio program on Sveriges Radio in Sweden the committee, unable to reach Bob Dylan directly about winning the 2016 Nobel Prize in Literature, had abandoned its efforts to reach him.", - 'Danius said, "Right now we are doing nothing. I have called and sent emails to his closest collaborator and received very friendly replies. For now, that is certainly enough."', - "Previously, Ring's CEO, Jamie Siminoff, remarked the company started when his doorbell wasn't audible from his shop in his garage.", - ] - inputs = tokenizer(FIRST_6_FLORES_200, padding=True, return_tensors="ms") - batch_translation = model.generate( - **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"] - ) - - EXPECTED_FAIRSEQ_TRANSLATION = [ - '"Nous avons maintenant des souris de 4 mois non diabétiques qui étaient diabétiques", a-t-il ajouté.', - "Le docteur Ehud Ur, professeur de médecine à l'université Dalhousie, à Halifax, en Nouvelle-Écosse, et président de la division clinique et scientifique de l'Association canadienne du diabète, prévient que la recherche n'en est qu'à ses débuts.", - "Comme d'autres spécialistes, il est sceptique quant à la guérison du diabète.", - "Lundi, Sara Danius, secrétaire permanente du Comité Nobel de littérature à l'Académie suédoise, a annoncé publiquement lors d'une émission de radio sur Sveriges Radio en Suède que le comité, incapable de joindre Bob Dylan directement pour lui annoncer le prix Nobel de littérature 2016, avait abandonné ses efforts pour le joindre.", - "Danius a déclaré: \"Pour l'instant, nous ne faisons rien. J'ai appelé et envoyé des courriels à son plus proche collaborateur et j'ai reçu des réponses très amicales. Pour l'instant, c'est certainement suffisant\".", - "Auparavant, le PDG de Ring, Jamie Siminoff, a fait remarquer que la société avait commencé lorsque sa sonnette n'était pas audible depuis son magasin dans son garage.", - ] - - translation = tokenizer.batch_decode( - batch_translation.tolist(), - clean_up_tokenization_spaces=True, - skip_special_tokens=True, - ) - assert translation == EXPECTED_FAIRSEQ_TRANSLATION - - -@require_mindspore -class NllbMoeRouterTest(unittest.TestCase): - r""" - Switch Transformers has different blocks from classic transformer based models. 
- The Swift MLP contains a Router class, that has to be tested to check if it is correctly implemented - - Original implementation of the routers here: - - """ - - config = NllbMoeConfig( - num_experts=4, - hidden_size=32, - d_ff=16, - expert_capacity=4, - ) - batch_size = 2 - sequence_length = 20 - - @unittest.skip - def test_top_2_routing(self): - # test routing with minimal reproduction - mask = ops.ones((self.batch_size, self.sequence_length), dtype=mindspore.bool_) - mask[0][0] = False - mask[1][0] = False - mask = mask.reshape(-1) - set_seed(0) - hidden_states = ops.rand( - (self.batch_size, self.sequence_length, self.config.hidden_size) - ) - classfier = nn.Linear(self.config.hidden_size, self.config.num_experts) - hf_router = NllbMoeTop2Router(self.config) - - _, _, hidden_dim = hidden_states.shape - logits = classfier( - hidden_states.reshape((self.batch_size * self.sequence_length), hidden_dim) - ) - top_1_mask, router_probs = hf_router.route_tokens(logits, padding_mask=mask) - ops.argmax(top_1_mask, dim=-1) - router_mask = router_probs.bool() - set_seed(0) - experts = [ - nn.Linear(hidden_dim, hidden_dim), - nn.Linear(hidden_dim, hidden_dim), - nn.Linear(hidden_dim, hidden_dim), - nn.Linear(hidden_dim, hidden_dim), - ] - hidden_states = hidden_states.reshape( - (self.batch_size * self.sequence_length), hidden_dim - ) - masked_hidden_states = ops.einsum("bm,be->ebm", hidden_states, router_mask) - for idx, expert in enumerate(experts): - token_indices = router_mask[:, idx] - - combining_weights = router_probs[token_indices, idx] - expert_input = masked_hidden_states[idx, token_indices] - if expert_input.shape[0] == 0: - continue - expert_output = expert(expert_input) - - expert_output *= 1 - self.config.moe_token_dropout - masked_hidden_states[idx, token_indices] = ops.einsum( - "b,be->be", combining_weights, expert_output - ) - hidden_states = masked_hidden_states.sum(axis=0).reshape( - self.batch_size, self.sequence_length, hidden_dim - ) - - EXPECTED_MEAN_FAIRSEQ_HIDDEN_STATES = mindspore.Tensor([[7.0340e-04, 2.7997e-03, -1.3351e-02, -7.6705e-03, -3.5089e-03, 3.9773e-03, 7.4593e-03, 1.2566e-02, 3.5860e-03, -2.7448e-02,-1.3731e-02, -1.0534e-02, -1.3606e-02, -1.5048e-02, -2.8914e-03,-5.0371e-03, -1.3963e-03, 6.0076e-03, -1.1380e-02, -1.4620e-02, 5.2401e-03, 8.4660e-04, -1.5319e-03, -1.6735e-02, 1.1302e-02, 3.6119e-03, 4.6084e-03, -1.3458e-02, 7.7792e-05, 1.4312e-02, 4.9107e-03, -5.0936e-03], [-4.4538e-03, 3.1026e-03, 1.4121e-04, -4.8121e-03, -5.6279e-03, 7.2493e-03, 3.9769e-03, 1.1114e-02, -1.5666e-03, -2.3477e-02, 8.7268e-03, 1.3446e-02, -2.8845e-05, -1.7287e-02, 8.7619e-03, -4.5316e-03, -1.2164e-02, 5.7461e-03, -4.5861e-03, -9.3907e-03, 2.9808e-02, 8.9206e-04, -7.6232e-04, -1.4173e-02, 3.0208e-03, 1.5310e-02, 9.7717e-03, 3.1014e-03, 7.8042e-03, 8.0197e-03, 3.4784e-03, -7.1728e-03]]) # fmt: skip - - self.assertTrue( - ops.allclose( - hidden_states.mean(1), EXPECTED_MEAN_FAIRSEQ_HIDDEN_STATES, 1e-4 - ) - ) - - @unittest.skip - def test_batch_prioritized_routing(self): - set_seed(0) - config = NllbMoeConfig( - num_experts=4, - hidden_size=32, - d_ff=16, - expert_capacity=4, - second_expert_policy="random", - ) - mask = ops.zeros( - (self.batch_size * self.sequence_length), dtype=mindspore.bool_ - ) - logits = ops.rand((self.batch_size * self.sequence_length, 4)) - # logits = logits.tolist() - # logits = [[0.4963, 0.7682, 0.0885, 0.1320], [0.3074, 0.6341, 0.4901, 0.8964], [0.4556, 0.6323, 0.3489, 0.4017],[0.0223, 0.1689, 0.2939, 0.5185],[0.6977, 0.8000, 0.1610, 0.2823],[0.6816, 
0.9152, 0.3971, 0.8742],[0.4194, 0.5529, 0.9527, 0.0362],[0.1852, 0.3734, 0.3051, 0.9320],[0.1759, 0.2698, 0.1507, 0.0317],[0.2081, 0.9298, 0.7231, 0.7423],[0.5263, 0.2437, 0.5846, 0.0332],[0.1387, 0.2422, 0.8155, 0.7932],[0.2783, 0.4820, 0.8198, 0.9971],[0.6984, 0.5675, 0.8352, 0.2056],[0.5932, 0.1123, 0.1535, 0.2417],[0.7262, 0.7011, 0.2038, 0.6511],[0.7745, 0.4369, 0.5191, 0.6159],[0.8102, 0.9801, 0.1147, 0.3168],[0.6965, 0.9143, 0.9351, 0.9412],[0.5995, 0.0652, 0.5460, 0.1872],[0.0340, 0.9442, 0.8802, 0.0012],[0.5936, 0.4158, 0.4177, 0.2711],[0.6923, 0.2038, 0.6833, 0.7529],[0.8579, 0.6870, 0.0051, 0.1757],[0.7497, 0.6047, 0.1100, 0.2121],[0.9704, 0.8369, 0.2820, 0.3742],[0.0237, 0.4910, 0.1235, 0.1143],[0.4725, 0.5751, 0.2952, 0.7967],[0.1957, 0.9537, 0.8426, 0.0784],[0.3756, 0.5226, 0.5730, 0.6186],[0.6962, 0.5300, 0.2560, 0.7366],[0.0204, 0.2036, 0.3748, 0.2564],[0.3251, 0.0902, 0.3936, 0.6069],[0.1743, 0.4743, 0.8579, 0.4486],[0.5139, 0.4569, 0.6012, 0.8179],[0.9736, 0.8175, 0.9747, 0.4638],[0.0508, 0.2630, 0.8405, 0.4968],[0.2515, 0.1168, 0.0321, 0.0780],[0.3986, 0.7742, 0.7703, 0.0178],[0.8119, 0.1087, 0.3943, 0.2973]] - # logits = mindspore.Tensor(logits) - config.batch_prioritized_routing = True - router = NllbMoeTop2Router(config) - top_1_mask, _ = router.route_tokens(logits, padding_mask=mask) - # check that the routing is batch first. One of the last token is routed while expert capacity is very small - # this means that it had a greater probability of being routed - assert top_1_mask[-1, 0] == 1 - - @unittest.skip - def test_second_expert_policy(self): - config = NllbMoeConfig( - num_experts=4, - hidden_size=32, - d_ff=16, - expert_capacity=40, - ) - set_seed(0) - mask = ops.zeros( - (self.batch_size * self.sequence_length), dtype=mindspore.bool_ - ) - logits = ops.rand((self.batch_size * self.sequence_length, 4)) - - set_seed(0) - config.second_expert_policy = "random" - router = NllbMoeTop2Router(config) - top_1_mask, router_probs = router.route_tokens(logits, padding_mask=mask) - - set_seed(0) - config.second_expert_policy = "sampling" - router = NllbMoeTop2Router(config) - top_1_mask_sp, router_probs_sp = router.route_tokens(logits, padding_mask=mask) - - set_seed(0) - config.second_expert_policy = "all" - router = NllbMoeTop2Router(config) - top_1_mask_all, router_probs_all = router.route_tokens( - logits, padding_mask=mask - ) - - # fmt: off - EXPECTED_ROUTER_ALL = mindspore.Tensor([[0.3902, 0.0000, 0.0000, 0.6098], [0.0000, 0.0000, 0.7770, 0.2230], [0.0000, 0.0000, 0.2726, 0.7274], [0.4221, 0.0000, 0.5779, 0.0000], [0.0000, 0.0000, 0.7810, 0.2190], [0.5518, 0.4482, 0.0000, 0.0000], [0.0000, 0.4060, 0.5940, 0.0000], [0.7340, 0.0000, 0.0000, 0.2660], [0.4778, 0.5222, 0.0000, 0.0000], [0.0000, 0.3984, 0.0000, 0.6016], [0.0000, 0.0548, 0.9452, 0.0000], [0.6796, 0.0000, 0.0000, 0.3204], [0.0700, 0.0000, 0.9300, 0.0000], [0.1854, 0.0000, 0.8146, 0.0000], [0.6775, 0.3225, 0.0000, 0.0000], [0.0000, 0.0000, 0.5027, 0.4973], [0.0000, 0.6577, 0.0000, 0.3423], [0.0000, 0.7767, 0.0000, 0.2233], [0.1944, 0.8056, 0.0000, 0.0000], [0.0000, 0.3073, 0.0000, 0.6927], [0.0000, 0.5655, 0.4345, 0.0000], [0.5791, 0.0000, 0.0000, 0.4209], [0.0440, 0.0000, 0.9560, 0.0000], [0.0083, 0.9917, 0.0000, 0.0000], [0.0000, 0.8395, 0.0000, 0.1605], [0.0000, 0.1458, 0.0000, 0.8542], [0.0000, 0.8534, 0.1466, 0.0000], [0.4938, 0.0000, 0.0000, 0.5062], [0.1329, 0.8671, 0.0000, 0.0000], [0.3058, 0.0000, 0.6942, 0.0000], [0.4458, 0.0000, 0.0000, 0.5542], [0.9053, 0.0947, 0.0000, 0.0000], [0.0000, 0.7563, 
0.2437, 0.0000], [0.0000, 0.0000, 0.4096, 0.5904], [0.4551, 0.0000, 0.0000, 0.5449], [0.8502, 0.1498, 0.0000, 0.0000], [0.0000, 0.6312, 0.3688, 0.0000], [0.8920, 0.0000, 0.0000, 0.1080], [0.1913, 0.0000, 0.0000, 0.8087], [0.2491, 0.7509, 0.0000, 0.0000]]) - EXPECTED_ROUTER_SP = mindspore.Tensor([[0.0000, 0.6539, 0.0000, 0.3461], [0.0000, 0.0000, 0.3998, 0.6002], [0.0000, 0.5574, 0.0000, 0.4426], [0.0000, 0.0000, 0.4441, 0.5559], [0.0000, 0.6545, 0.3455, 0.0000], [0.4419, 0.5581, 0.0000, 0.0000], [0.0000, 0.4014, 0.5986, 0.0000], [0.3215, 0.0000, 0.0000, 0.6785], [0.4765, 0.5235, 0.0000, 0.0000], [0.0000, 0.5467, 0.0000, 0.4533], [0.0000, 0.4156, 0.5844, 0.0000], [0.3370, 0.0000, 0.6630, 0.0000], [0.0000, 0.0000, 0.4558, 0.5442], [0.4659, 0.0000, 0.5341, 0.0000], [0.6179, 0.3821, 0.0000, 0.0000], [0.6277, 0.0000, 0.3723, 0.0000], [0.5836, 0.4164, 0.0000, 0.0000], [0.0000, 0.6600, 0.0000, 0.3400], [0.0000, 0.4933, 0.0000, 0.5067], [0.6016, 0.0000, 0.0000, 0.3984], [0.0000, 0.5160, 0.4840, 0.0000], [0.5799, 0.0000, 0.0000, 0.4201], [0.0000, 0.0000, 0.4826, 0.5174], [0.5426, 0.4574, 0.0000, 0.0000], [0.5362, 0.4638, 0.0000, 0.0000], [0.6448, 0.0000, 0.0000, 0.3552], [0.0000, 0.5909, 0.4091, 0.0000], [0.4196, 0.0000, 0.0000, 0.5804], [0.3191, 0.6809, 0.0000, 0.0000], [0.0000, 0.0000, 0.4886, 0.5114], [0.4899, 0.0000, 0.0000, 0.5101], [0.4123, 0.0000, 0.5877, 0.0000], [0.0000, 0.3736, 0.0000, 0.6264], [0.0000, 0.0000, 0.6009, 0.3991], [0.4246, 0.0000, 0.0000, 0.5754], [0.4997, 0.0000, 0.5003, 0.0000], [0.0000, 0.3595, 0.6405, 0.0000], [0.5433, 0.0000, 0.0000, 0.4567], [0.0000, 0.6806, 0.0000, 0.3194], [0.6689, 0.3311, 0.0000, 0.0000]]) - EXPECTED_ROUTER = mindspore.Tensor([[0.4324, 0.5676, 0.0000, 0.0000], [0.0000, 0.4348, 0.0000, 0.5652], [0.4559, 0.5441, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000, 1.0000], [0.4744, 0.5256, 0.0000, 0.0000], [0.0000, 0.5103, 0.0000, 0.4897], [0.0000, 0.0000, 1.0000, 0.0000], [0.0000, 0.0000, 0.0000, 1.0000], [0.0000, 1.0000, 0.0000, 0.0000], [0.0000, 0.5467, 0.0000, 0.4533], [0.0000, 0.0000, 1.0000, 0.0000], [0.0000, 0.0000, 1.0000, 0.0000], [0.0000, 0.0000, 0.0000, 1.0000], [0.0000, 0.0000, 1.0000, 0.0000], [1.0000, 0.0000, 0.0000, 0.0000], [0.5063, 0.4937, 0.0000, 0.0000], [0.5396, 0.0000, 0.0000, 0.4604], [0.4576, 0.5424, 0.0000, 0.0000], [0.0000, 0.0000, 0.0000, 1.0000], [0.5134, 0.0000, 0.4866, 0.0000], [0.0000, 0.5160, 0.4840, 0.0000], [0.5439, 0.0000, 0.4561, 0.0000], [0.4849, 0.0000, 0.0000, 0.5151], [0.5426, 0.4574, 0.0000, 0.0000], [0.5362, 0.4638, 0.0000, 0.0000], [1.0000, 0.0000, 0.0000, 0.0000], [0.0000, 1.0000, 0.0000, 0.0000], [0.0000, 0.4448, 0.0000, 0.5552], [0.0000, 1.0000, 0.0000, 0.0000], [0.0000, 0.0000, 0.4886, 0.5114], [0.4899, 0.0000, 0.0000, 0.5101], [0.0000, 0.0000, 0.5296, 0.4704], [0.0000, 0.0000, 0.4469, 0.5531], [0.0000, 0.4053, 0.5947, 0.0000], [0.0000, 0.0000, 0.4460, 0.5540], [0.4997, 0.0000, 0.5003, 0.0000], [0.0000, 0.0000, 0.5851, 0.4149], [1.0000, 0.0000, 0.0000, 0.0000], [0.0000, 0.5010, 0.4990, 0.0000], [1.0000, 0.0000, 0.0000, 0.0000]]) - - EXPECTED_TOP_1_ALL = mindspore.Tensor([[0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0, 1, 
0, 0], [0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0]]) - EXPECTED_TOP_1_SP = mindspore.Tensor([[0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0]]) - # `sampling` and `random` do not affect the mask of the top_1 router - # fmt: on - - self.assertTrue( - ops.allclose(router_probs_all, EXPECTED_ROUTER_ALL, rtol=1e-4, atol=1e-4) - ) - self.assertTrue( - ops.allclose(router_probs_sp, EXPECTED_ROUTER_SP, rtol=1e-4, atol=1e-4) - ) - self.assertTrue( - ops.allclose(router_probs, EXPECTED_ROUTER, rtol=1e-4, atol=1e-4) - ) - self.assertTrue( - ops.allclose(top_1_mask_all, EXPECTED_TOP_1_ALL, rtol=1e-4, atol=1e-4) - ) - self.assertTrue( - ops.allclose(top_1_mask_sp, EXPECTED_TOP_1_SP, rtol=1e-4, atol=1e-4) - ) - self.assertTrue( - ops.allclose(top_1_mask, EXPECTED_TOP_1_SP, rtol=1e-4, atol=1e-4) - ) - diff --git a/tests/transformers/models/nougat/__init__.py b/tests/transformers/models/nougat/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/nougat/test_image_processing_nougat.py b/tests/transformers/models/nougat/test_image_processing_nougat.py deleted file mode 100644 index ac2710b2b..000000000 --- a/tests/transformers/models/nougat/test_image_processing_nougat.py +++ /dev/null @@ -1,221 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import ( - cached_property, - is_mindspore_available, - is_vision_available, -) - -from ...test_image_processing_common import ( - ImageProcessingTestMixin, - prepare_image_inputs, -) - - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import NougatImageProcessor - - -class NougatImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_crop_margin=True, - do_resize=True, - size=None, - do_thumbnail=True, - do_align_long_axis: bool = False, - do_pad=True, - do_normalize: bool = True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"height": 20, "width": 20} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_crop_margin = do_crop_margin - self.do_resize = do_resize - self.size = size - self.do_thumbnail = do_thumbnail - self.do_align_long_axis = do_align_long_axis - self.do_pad = do_pad - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "do_crop_margin": self.do_crop_margin, - "do_resize": self.do_resize, - "size": self.size, - "do_thumbnail": self.do_thumbnail, - "do_align_long_axis": self.do_align_long_axis, - "do_pad": self.do_pad, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def prepare_dummy_image(self): - filepath = hf_hub_download( - repo_id="hf-internal-testing/fixtures_docvqa", - filename="nougat_pdf.png", - repo_type="dataset", - ) - image = Image.open(filepath).convert("RGB") - return image - - def prepare_image_inputs( - self, equal_resolution=False, numpify=False, torchify=False - ): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class NougatImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = NougatImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = NougatImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - @cached_property - def image_processor(self): - return self.image_processing_class(**self.image_processor_dict) - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def 
test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict - ) - self.assertEqual(image_processor.size, {"height": 20, "width": 20}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42 - ) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - def test_expected_output(self): - dummy_image = self.image_processor_tester.prepare_dummy_image() - image_processor = self.image_processor - inputs = image_processor(dummy_image, return_tensors="ms") - self.assertTrue( - np.allclose( - inputs["pixel_values"].mean().asnumpy(), - mindspore.tensor(0.4906).asnumpy(), - atol=1e-3, - rtol=1e-3, - ) - ) - - def test_crop_margin_all_white(self): - image = np.uint8(np.ones((100, 100, 3)) * 255) - image_processor = self.image_processor - cropped_image = image_processor.crop_margin(image) - self.assertTrue(np.array_equal(image, cropped_image)) - - def test_crop_margin_centered_black_square(self): - image = np.ones((100, 100, 3), dtype=np.uint8) * 255 - image[45:55, 45:55, :] = 0 - image_processor = self.image_processor - cropped_image = image_processor.crop_margin(image) - expected_cropped = image[45:55, 45:55, :] - self.assertTrue(np.array_equal(expected_cropped, cropped_image)) - - def test_align_long_axis_no_rotation(self): - image = np.uint8(np.ones((100, 200, 3)) * 255) - image_processor = self.image_processor - size = {"height": 200, "width": 300} - aligned_image = image_processor.align_long_axis(image, size) - self.assertEqual(image.shape, aligned_image.shape) - - def test_align_long_axis_with_rotation(self): - image = np.uint8(np.ones((200, 100, 3)) * 255) - image_processor = self.image_processor - size = {"height": 300, "width": 200} - aligned_image = image_processor.align_long_axis(image, size) - self.assertEqual((200, 100, 3), aligned_image.shape) - - def test_align_long_axis_data_format(self): - image = np.uint8(np.ones((100, 200, 3)) * 255) - data_format = "channels_first" - size = {"height": 200, "width": 300} - image_processor = self.image_processor - aligned_image = image_processor.align_long_axis( - image, size, data_format=data_format - ) - self.assertEqual((3, 100, 200), aligned_image.shape) - - def prepare_dummy_np_image(self): - filepath = hf_hub_download( - repo_id="hf-internal-testing/fixtures_docvqa", - filename="nougat_pdf.png", - repo_type="dataset", - ) - image = Image.open(filepath).convert("RGB") - return np.array(image) - - def test_crop_margin_equality_cv2_python(self): - image = self.prepare_dummy_np_image() - image_processor = self.image_processor - image_cropped_python = image_processor.crop_margin(image) - - self.assertEqual(image_cropped_python.shape, (850, 685, 3)) - self.assertEqual(image_cropped_python.mean(), 237.43881150708458) diff --git a/tests/transformers/models/nougat/test_modeling_nougat.py b/tests/transformers/models/nougat/test_modeling_nougat.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/nystromformer/__init__.py b/tests/transformers/models/nystromformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/nystromformer/test_modeling_nystromformer.py b/tests/transformers/models/nystromformer/test_modeling_nystromformer.py deleted file mode 100644 index 09bfe5042..000000000 --- a/tests/transformers/models/nystromformer/test_modeling_nystromformer.py +++ /dev/null @@ -1,296 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The 
HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Nystromformer model.""" - -import unittest - -import numpy as np -import mindspore -from mindspore import Tensor -from mindnlp.transformers import NystromformerConfig -from mindnlp.utils.testing_utils import slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -from mindnlp.transformers import ( - NystromformerForMaskedLM, - NystromformerForMultipleChoice, - NystromformerForQuestionAnswering, - NystromformerForSequenceClassification, - NystromformerForTokenClassification, - NystromformerModel, -) - - -class NystromformerModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - 
- def get_config(self): - return NystromformerConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = NystromformerModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = NystromformerForMaskedLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = NystromformerForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = NystromformerForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = NystromformerForTokenClassification(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = NystromformerForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = 
input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -# class NystromformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): -class NystromformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - # NystromformerModel, - NystromformerForMaskedLM, - NystromformerForMultipleChoice, - NystromformerForQuestionAnswering, - NystromformerForSequenceClassification, - NystromformerForTokenClassification, - ) - pipeline_model_mapping ={ - "feature-extraction": NystromformerModel, - "fill-mask": NystromformerForMaskedLM, - "question-answering": NystromformerForQuestionAnswering, - "text-classification": NystromformerForSequenceClassification, - "token-classification": NystromformerForTokenClassification, - "zero-shot": NystromformerForSequenceClassification, - } - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = NystromformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=NystromformerConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - # def test_model(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # self.model_tester.create_and_check_model(*config_and_inputs) - - # def test_model_various_embeddings(self): - # config_and_inputs = self.model_tester.prepare_config_and_inputs() - # for type in ["absolute", "relative_key", "relative_key_query"]: - # config_and_inputs[0].position_embedding_type = type - # self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "uw-madison/nystromformer-512" - model = NystromformerModel.from_pretrained(model_name,from_pt=True) - self.assertIsNotNone(model) - - -class NystromformerModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = NystromformerModel.from_pretrained("uw-madison/nystromformer-512",from_pt=True) - input_ids = 
Tensor([[0, 1, 2, 3, 4, 5]]) - - output = model(input_ids)[0] - - expected_shape = (1, 6, 768) - self.assertEqual(output.shape, expected_shape) - - expected_slice = Tensor( - [[[-0.4532, -0.0936, 0.5137], [-0.2676, 0.0628, 0.6186], [-0.3629, -0.1726, 0.4716]]] - ) - - self.assertTrue(Tensor(np.allclose(Tensor(output[:, :3, :3]).asnumpy(), expected_slice.asnumpy(), atol=1e-4))) diff --git a/tests/transformers/models/olmo/__init__.py b/tests/transformers/models/olmo/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/olmo/test_modeling_olmo.py b/tests/transformers/models/olmo/test_modeling_olmo.py deleted file mode 100644 index 7803f33bb..000000000 --- a/tests/transformers/models/olmo/test_modeling_olmo.py +++ /dev/null @@ -1,439 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch OLMo model.""" - -import unittest - -from parameterized import parameterized - -from mindnlp.engine import set_seed -from mindnlp.transformers import OlmoConfig -from mindnlp.transformers.models.auto.tokenization_auto import AutoTokenizer -from mindnlp.transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast -from mindnlp.utils.testing_utils import ( - is_flaky, - require_tokenizers, - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - OlmoForCausalLM, - OlmoModel, - ) - - -class OlmoModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="silu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - 
self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return OlmoConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = OlmoModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = OlmoModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = OlmoForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = 
OlmoForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class OlmoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (OlmoModel, OlmoForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (OlmoForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": OlmoModel, - "text-generation": OlmoForCausalLM, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - fx_compatible = False - - # Need to use `0.8` instead of `0.9` for `test_cpu_offload` - # This is because we are hitting edge cases with the causal_mask buffer - model_split_percents = [0.5, 0.7, 0.8] - - def setUp(self): - self.model_tester = OlmoModelTester(self) - self.config_tester = ConfigTester(self, config_class=OlmoConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="OLMo does not support head pruning.") - def test_headmasking(self): - pass - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="OLMo buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - 
- @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = OlmoModel(config) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = OlmoModel(config) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - self.assertTrue(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - else: - self.assertFalse(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(ops.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - -@require_mindspore -class OlmoIntegrationTest(unittest.TestCase): - @slow - def test_model_1b_logits(self): - input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] - model = OlmoForCausalLM.from_pretrained("allenai/OLMo-1B-hf", device_map="auto") - out = model(mindspore.tensor(input_ids)).logits - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[2.2869, 0.3315, 0.9876, 1.4146, 1.8804, 2.0430, 1.7055, 1.2065]]) - assert ops.allclose(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([2.5551, -1.1230, 11.0510, 12.4977, 7.9651, 7.2342, 6.1885, 7.8340, 9.9847, 12.6695, 12.2345, 10.7970, 8.4749, 14.2483, 12.9588, 13.9233, 11.0496, 5.5749, 7.4466, 7.7914, 6.8440, 5.8951, 4.8180, 4.1935, 4.5216, 4.7256, 3.9553, 12.2870, 12.4990, 8.1591]) # fmt: skip - assert ops.allclose(out[0, 0, :30], EXPECTED_SLICE, atol=1e-2, rtol=1e-2) - - @slow - def test_model_7b_logits(self): - input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] - model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-hf", device_map="auto") - out = model(mindspore.tensor(input_ids)).logits - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[0.0271, 0.0249, -0.0578, -0.0870, 0.0167, 0.0710, 0.1002, 0.0677]]) - assert ops.allclose(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([-1.7433, -1.6685, 7.4941, 6.1506, 0.1364, -0.1127, 1.3224, 4.5458, 4.2068, 5.8296, 7.4723, 2.7925, 3.1245, 10.8872, 10.0758, 10.6717, 7.0945, 1.2398, 3.6766, 4.2365, 2.5655, 2.2222, 1.7418, 0.5223, 0.7753, 1.0938, 0.6723, 6.2522, 6.2264, 1.8105]) # fmt: skip - assert ops.allclose(out[0, 0, :30], EXPECTED_SLICE, atol=1e-2, rtol=1e-2) - - @slow - def test_model_7b_twin_2t_logits(self): - input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] - model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-Twin-2T-hf", device_map="auto") - out = model(mindspore.tensor(input_ids)).logits - # Expected mean on dim = -1 - 
EXPECTED_MEAN = mindspore.tensor([[-0.3636, -0.3825, -0.4800, -0.3696, -0.8388, -0.9737, -0.9849, -0.8356]]) - assert ops.allclose(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([-2.0833, -1.9234, 8.7312, 7.8049, 1.0372, 0.8941, 3.1548, 1.8502, 5.5511, 5.5793, 8.1166, 4.5906, 1.8691, 11.6377, 8.9858, 11.6447, 7.4549, 1.4725, 2.8399, 2.7568, 1.4011, 1.6958, 0.5572, 0.5231, 0.3068, 0.5364, 0.6769, 7.9636, 8.2379, 1.7950]) # fmt: skip - assert ops.allclose(out[0, 0, :30], EXPECTED_SLICE, atol=1e-2, rtol=1e-2) - - @slow - def test_model_7b_greedy_generation(self): - EXPECTED_TEXT_COMPLETION = """Simply put, the theory of relativity states that \nthe speed of light is the same for all observers.\n\nThe theory of relativity is a theory of physics that describes the \nmovement of objects in space and time.\n\nThe theory of relativity is a theory of physics that describes the \nmovement of objects in space and time.\n\n""" - prompt = "Simply put, the theory of relativity states that " - tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B-hf", device_map="auto") - input_ids = tokenizer.encode(prompt, return_tensors="ms") - model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-hf", device_map="auto") - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=64, top_p=None, temperature=1, do_sample=False) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - @require_tokenizers - def test_fast_special_tokens(self): - fast_tokenizer = GPTNeoXTokenizerFast.from_pretrained("allenai/OLMo-1B-hf") - - original_add_eos_token = fast_tokenizer.add_eos_token - - fast_tokenizer.add_eos_token = False - fast = fast_tokenizer.encode("A sample test") - self.assertEqual(fast, [34, 3410, 1071]) - - fast_tokenizer.add_eos_token = True - fast = fast_tokenizer.encode("A sample test") - self.assertEqual(fast, [34, 3410, 1071, 50279]) - - fast_tokenizer.add_eos_token = original_add_eos_token - - @require_tokenizers - def test_simple_encode_decode(self): - rust_tokenizer = GPTNeoXTokenizerFast.from_pretrained("allenai/OLMo-1B-hf") - - self.assertEqual(rust_tokenizer.encode("This is a test"), [1552, 310, 247, 1071]) - self.assertEqual(rust_tokenizer.decode([1552, 310, 247, 1071], skip_special_tokens=True), "This is a test") - - # bytefallback showcase - self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [20025, 46549, 5225, 48561, 33656, 238, 12105]) # fmt: skip - self.assertEqual( - rust_tokenizer.decode([20025, 46549, 5225, 48561, 33656, 238, 12105], skip_special_tokens=True), - "生活的真谛是", - ) - - # Inner spaces showcase - self.assertEqual(rust_tokenizer.encode("Hi Hello"), [12764, 50276, 12092]) - self.assertEqual(rust_tokenizer.decode([12764, 50276, 12092], skip_special_tokens=True), "Hi Hello") - - self.assertEqual(rust_tokenizer.encode("Hi Hello"), [12764, 50275, 12092]) - self.assertEqual(rust_tokenizer.decode([12764, 50275, 12092], skip_special_tokens=True), "Hi Hello") - - self.assertEqual(rust_tokenizer.encode(""), []) - - self.assertEqual(rust_tokenizer.encode(" "), [209]) - - self.assertEqual(rust_tokenizer.encode(" "), [50276]) - - self.assertEqual(rust_tokenizer.encode(" Hello"), [24387]) \ No newline at end of file diff --git a/tests/transformers/models/oneformer/__init__.py b/tests/transformers/models/oneformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git 
a/tests/transformers/models/oneformer/test_image_processing_oneformer.py b/tests/transformers/models/oneformer/test_image_processing_oneformer.py deleted file mode 100644 index 22e5a97ce..000000000 --- a/tests/transformers/models/oneformer/test_image_processing_oneformer.py +++ /dev/null @@ -1,351 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch OneFormer processor.""" - -import json -import os -import tempfile -import unittest - -import numpy as np -from mindspore import ops - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - if is_vision_available(): - from mindnlp.transformers import OneFormerImageProcessor - from mindnlp.transformers.models.oneformer.image_processing_oneformer import binary_mask_to_rle, prepare_metadata - from mindnlp.transformers.models.oneformer.modeling_oneformer import OneFormerForUniversalSegmentationOutput - -if is_vision_available(): - from PIL import Image - - -class OneFormerImageProcessorTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - size=None, - do_resize=True, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - num_labels=10, - do_reduce_labels=False, - ignore_index=255, - repo_path="shi-labs/oneformer_demo", - class_info_file="ade20k_panoptic.json", - num_text=10, - ): - super().__init__() - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = {"shortest_edge": 32, "longest_edge": 1333} if size is None else size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.class_info_file = class_info_file - self.num_text = num_text - self.repo_path = repo_path - - # for the post_process_functions - self.batch_size = 2 - self.num_queries = 10 - self.num_classes = 10 - self.height = 3 - self.width = 4 - self.num_labels = num_labels - self.do_reduce_labels = do_reduce_labels - self.ignore_index = ignore_index - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "num_labels": self.num_labels, - "do_reduce_labels": self.do_reduce_labels, - "ignore_index": self.ignore_index, - "class_info_file": self.class_info_file, - "num_text": self.num_text, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to OneFormerImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def get_fake_oneformer_outputs(self): - return OneFormerForUniversalSegmentationOutput( - # +1 for null class - class_queries_logits=ops.randn((self.batch_size, self.num_queries, self.num_classes + 1)), - masks_queries_logits=ops.randn((self.batch_size, self.num_queries, self.height, self.width)), - ) - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class OneFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = OneFormerImageProcessor if (is_vision_available() and is_mindspore_available()) else None - # only for test_image_processing_common.test_image_proc_to_json_string - image_processing_class = image_processing_class - - def setUp(self): - self.image_processor_tester = OneFormerImageProcessorTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_proc_properties(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "image_mean")) - self.assertTrue(hasattr(image_processor, "image_std")) - self.assertTrue(hasattr(image_processor, "do_normalize")) - self.assertTrue(hasattr(image_processor, "do_resize")) - self.assertTrue(hasattr(image_processor, "size")) - self.assertTrue(hasattr(image_processor, "ignore_index")) - self.assertTrue(hasattr(image_processor, "class_info_file")) - self.assertTrue(hasattr(image_processor, "num_text")) - self.assertTrue(hasattr(image_processor, "repo_path")) - self.assertTrue(hasattr(image_processor, "metadata")) - self.assertTrue(hasattr(image_processor, "do_reduce_labels")) - - def comm_get_image_processor_inputs( - self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np" - ): - image_processor = self.image_processing_class(**self.image_processor_dict) - # prepare image and target - num_labels = self.image_processor_tester.num_labels - annotations = None - instance_id_to_semantic_id = None - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - if with_segmentation_maps: - high = num_labels - if is_instance_map: - labels_expanded = list(range(num_labels)) 
* 2 - instance_id_to_semantic_id = dict(enumerate(labels_expanded)) - annotations = [ - np.random.randint(0, high * 2, (img.size[1], img.size[0])).astype(np.uint8) for img in image_inputs - ] - if segmentation_type == "pil": - annotations = [Image.fromarray(annotation) for annotation in annotations] - - inputs = image_processor( - image_inputs, - ["semantic"] * len(image_inputs), - annotations, - return_tensors="ms", - instance_id_to_semantic_id=instance_id_to_semantic_id, - pad_and_return_pixel_mask=True, - ) - - return inputs - - def test_init_without_params(self): - pass - - def test_call_with_segmentation_maps(self): - def common(is_instance_map=False, segmentation_type=None): - inputs = self.comm_get_image_processor_inputs( - with_segmentation_maps=True, is_instance_map=is_instance_map, segmentation_type=segmentation_type - ) - - mask_labels = inputs["mask_labels"] - class_labels = inputs["class_labels"] - pixel_values = inputs["pixel_values"] - text_inputs = inputs["text_inputs"] - - # check the batch_size - for mask_label, class_label, text_input in zip(mask_labels, class_labels, text_inputs): - self.assertEqual(mask_label.shape[0], class_label.shape[0]) - # this ensure padding has happened - self.assertEqual(mask_label.shape[1:], pixel_values.shape[2:]) - self.assertEqual(len(text_input), self.image_processor_tester.num_text) - - common() - common(is_instance_map=True) - common(is_instance_map=False, segmentation_type="pil") - common(is_instance_map=True, segmentation_type="pil") - - def test_binary_mask_to_rle(self): - fake_binary_mask = np.zeros((20, 50)) - fake_binary_mask[0, 20:] = 1 - fake_binary_mask[1, :15] = 1 - fake_binary_mask[5, :10] = 1 - - rle = binary_mask_to_rle(fake_binary_mask) - self.assertEqual(len(rle), 4) - self.assertEqual(rle[0], 21) - self.assertEqual(rle[1], 45) - - def test_post_process_semantic_segmentation(self): - fature_extractor = self.image_processing_class( - num_labels=self.image_processor_tester.num_classes, - max_seq_length=77, - task_seq_length=77, - class_info_file="ade20k_panoptic.json", - num_text=self.image_processor_tester.num_text, - repo_path="shi-labs/oneformer_demo", - ) - outputs = self.image_processor_tester.get_fake_oneformer_outputs() - - segmentation = fature_extractor.post_process_semantic_segmentation(outputs) - - self.assertEqual(len(segmentation), self.image_processor_tester.batch_size) - self.assertEqual( - segmentation[0].shape, - ( - self.image_processor_tester.height, - self.image_processor_tester.width, - ), - ) - - target_sizes = [(1, 4) for i in range(self.image_processor_tester.batch_size)] - segmentation = fature_extractor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes) - - self.assertEqual(segmentation[0].shape, target_sizes[0]) - - def test_post_process_instance_segmentation(self): - image_processor = self.image_processing_class( - num_labels=self.image_processor_tester.num_classes, - max_seq_length=77, - task_seq_length=77, - class_info_file="ade20k_panoptic.json", - num_text=self.image_processor_tester.num_text, - repo_path="shi-labs/oneformer_demo", - ) - outputs = self.image_processor_tester.get_fake_oneformer_outputs() - segmentation = image_processor.post_process_instance_segmentation(outputs, threshold=0) - - self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual( - el["segmentation"].shape, 
(self.image_processor_tester.height, self.image_processor_tester.width) - ) - - segmentation_with_opts = image_processor.post_process_instance_segmentation( - outputs, - threshold=0, - target_sizes=[(1, 4) for _ in range(self.image_processor_tester.batch_size)], - task_type="panoptic", - ) - self.assertTrue(len(segmentation_with_opts) == self.image_processor_tester.batch_size) - for el in segmentation_with_opts: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual(el["segmentation"].shape, (1, 4)) - - def test_post_process_panoptic_segmentation(self): - image_processor = self.image_processing_class( - num_labels=self.image_processor_tester.num_classes, - max_seq_length=77, - task_seq_length=77, - class_info_file="ade20k_panoptic.json", - num_text=self.image_processor_tester.num_text, - repo_path="shi-labs/oneformer_demo", - ) - outputs = self.image_processor_tester.get_fake_oneformer_outputs() - segmentation = image_processor.post_process_panoptic_segmentation(outputs, threshold=0) - - self.assertTrue(len(segmentation) == self.image_processor_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual( - el["segmentation"].shape, (self.image_processor_tester.height, self.image_processor_tester.width) - ) - - def test_can_load_with_local_metadata(self): - # Create a temporary json file - class_info = { - "0": {"isthing": 0, "name": "foo"}, - "1": {"isthing": 0, "name": "bar"}, - "2": {"isthing": 1, "name": "baz"}, - } - metadata = prepare_metadata(class_info) - - with tempfile.TemporaryDirectory() as tmpdirname: - metadata_path = os.path.join(tmpdirname, "metadata.json") - with open(metadata_path, "w") as f: - json.dump(class_info, f) - - config_dict = self.image_processor_dict - config_dict["class_info_file"] = metadata_path - config_dict["repo_path"] = tmpdirname - image_processor = self.image_processing_class(**config_dict) - - self.assertEqual(image_processor.metadata, metadata) diff --git a/tests/transformers/models/oneformer/test_modeling_oneformer.py b/tests/transformers/models/oneformer/test_modeling_oneformer.py deleted file mode 100644 index fb4cc0b2b..000000000 --- a/tests/transformers/models/oneformer/test_modeling_oneformer.py +++ /dev/null @@ -1,518 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore OneFormer model.""" - -import copy -import inspect -import unittest - -import numpy as np - -from ...test_modeling_common import floats_tensor -from mindnlp.transformers import OneFormerConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import OneFormerForUniversalSegmentation, OneFormerModel - - if is_vision_available(): - from mindnlp.transformers import OneFormerProcessor - -if is_vision_available(): - from PIL import Image - - -def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - return configs_no_init - - -class OneFormerModelTester: - def __init__( - self, - parent, - batch_size=2, - is_training=True, - vocab_size=99, - use_auxiliary_loss=False, - num_queries=10, - num_channels=3, - min_size=32 * 8, - max_size=32 * 8, - num_labels=4, - hidden_dim=64, - sequence_length=77, - n_ctx=4, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.vocab_size = vocab_size - self.use_auxiliary_loss = use_auxiliary_loss - self.num_queries = num_queries - self.num_channels = num_channels - self.min_size = min_size - self.max_size = max_size - self.num_labels = num_labels - self.hidden_dim = hidden_dim - self.sequence_length = sequence_length - self.n_ctx = n_ctx - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) - - task_inputs = ( - ops.randint(high=self.vocab_size, size=(self.batch_size, self.sequence_length)).long() - ) - - pixel_mask = ops.ones([self.batch_size, self.min_size, self.max_size]) - - text_inputs = ( - ops.randint( - high=self.vocab_size, size=(self.batch_size, self.num_queries - self.n_ctx, self.sequence_length) - ) - .long() - ) - - mask_labels = ( - ops.rand([self.batch_size, self.num_labels, self.min_size, self.max_size]) > 0.5 - ).float() - class_labels = (ops.rand((self.batch_size, self.num_labels)) > 0.5).long() - - config = self.get_config() - return config, pixel_values, task_inputs, text_inputs, pixel_mask, mask_labels, class_labels - - def get_config(self): - config = OneFormerConfig( - text_encoder_vocab_size=self.vocab_size, - hidden_size=self.hidden_dim, - num_queries=self.num_queries, - num_labels=self.num_labels, - encoder_feedforward_dim=32, - dim_feedforward=64, - encoder_layers=2, - decoder_layers=2, - ) - - config.backbone_config.embed_dim = 16 - config.backbone_config.depths = [1, 1, 1, 1] - config.backbone_config.hidden_size = 16 - config.backbone_config.num_channels = self.num_channels - config.backbone_config.num_heads = [1, 1, 2, 2] - config.backbone = None - - config.hidden_dim = self.hidden_dim - config.mask_dim = self.hidden_dim - config.conv_dim = self.hidden_dim - - config.text_encoder_width = self.hidden_dim - config.task_seq_len = self.sequence_length - config.max_seq_len = self.sequence_length - config.text_encoder_context_length = self.sequence_length - 
config.text_encoder_n_ctx = self.n_ctx - - return config - - def prepare_config_and_inputs_for_common(self): - config, pixel_values, task_inputs, pixel_mask, _, _, _ = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "task_inputs": task_inputs} - return config, inputs_dict - - def check_output_hidden_state(self, output, config): - encoder_hidden_states = output.encoder_hidden_states - pixel_decoder_hidden_states = output.pixel_decoder_hidden_states - transformer_decoder_hidden_states = output.transformer_decoder_hidden_states - - self.parent.assertTrue(len(encoder_hidden_states), len(config.backbone_config.depths)) - self.parent.assertTrue(len(pixel_decoder_hidden_states), config.encoder_layers) - self.parent.assertTrue(len(transformer_decoder_hidden_states), config.decoder_layers - 1) - - def create_and_check_oneformer_model( - self, config, pixel_values, task_inputs, pixel_mask, output_hidden_states=False - ): - with no_grad(): - model = OneFormerModel(config=config) - model.eval() - - output = model(pixel_values=pixel_values, task_inputs=task_inputs, pixel_mask=pixel_mask) - output = model(pixel_values, task_inputs=task_inputs, output_hidden_states=True) - # the correct shape of output.transformer_decoder_hidden_states ensure the correcteness of the - # encoder and pixel decoder - self.parent.assertEqual( - output.transformer_decoder_object_queries.shape, - (self.batch_size, self.num_queries, self.hidden_dim), - ) - # let's ensure the other two hidden state exists - self.parent.assertTrue(output.pixel_decoder_hidden_states is not None) - self.parent.assertTrue(output.encoder_hidden_states is not None) - - if output_hidden_states: - self.check_output_hidden_state(output, config) - - def create_and_check_oneformer_universal_segmentation_head_model( - self, config, pixel_values, task_inputs, text_inputs, pixel_mask, mask_labels, class_labels - ): - model = OneFormerForUniversalSegmentation(config=config) - model.eval() - - def comm_check_on_output(result): - # let's still check that all the required stuff is there - self.parent.assertTrue(result.transformer_decoder_hidden_states is not None) - self.parent.assertTrue(result.pixel_decoder_hidden_states is not None) - self.parent.assertTrue(result.encoder_hidden_states is not None) - # okay, now we need to check the logits shape - # due to the encoder compression, masks have a //4 spatial size - self.parent.assertEqual( - result.masks_queries_logits.shape, - (self.batch_size, self.num_queries, self.min_size // 4, self.max_size // 4), - ) - # + 1 for null class - self.parent.assertEqual( - result.class_queries_logits.shape, (self.batch_size, self.num_queries, self.num_labels + 1) - ) - - with no_grad(): - result = model(pixel_values=pixel_values, task_inputs=task_inputs, pixel_mask=pixel_mask) - result = model(pixel_values, task_inputs) - - comm_check_on_output(result) - - config.is_training = True - model = OneFormerForUniversalSegmentation(config=config) - model.eval() - - with no_grad(): - result = model( - pixel_values=pixel_values, - task_inputs=task_inputs, - pixel_mask=pixel_mask, - mask_labels=mask_labels, - class_labels=class_labels, - text_inputs=text_inputs, - ) - - comm_check_on_output(result) - - self.parent.assertTrue(result.loss is not None) - self.parent.assertEqual(result.loss.shape, (1,)) - - -@require_mindspore -class OneFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (OneFormerModel, OneFormerForUniversalSegmentation) if 
is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": OneFormerModel} if is_mindspore_available() else {} - - is_encoder_decoder = False - test_pruning = False - test_head_masking = False - test_missing_keys = False - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "FeatureExtractionPipelineTests": - return True - - return False - - def setUp(self): - self.model_tester = OneFormerModelTester(self) - self.config_tester = ConfigTester(self, config_class=OneFormerConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_oneformer_model(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_oneformer_model(config, **inputs, output_hidden_states=False) - - def test_oneformer_universal_segmentation_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_oneformer_universal_segmentation_head_model(*config_and_inputs) - - def test_model_main_input_name(self): - for model_class in self.all_model_classes: - model_signature = inspect.signature(getattr(model_class, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1:3] - self.assertEqual(model_class.main_input_name, observed_main_input_name) - - @unittest.skip(reason="OneFormer uses two main inputs") - def test_torchscript_simple(self): - pass - - @unittest.skip(reason="OneFormer uses two main inputs") - def test_torchscript_output_attentions(self): - pass - - @unittest.skip(reason="OneFormer uses two main inputs") - def test_torchscript_output_hidden_state(self): - pass - - @unittest.skip(reason="OneFormer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="OneFormer does not have a get_input_embeddings method") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="OneFormer is not a generative model") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="OneFormer does not use token embeddings") - def test_resize_tokens_embeddings(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values", "task_inputs"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - @slow - def test_model_from_pretrained(self): - for model_name in ["shi-labs/oneformer_ade20k_swin_tiny"]: - model = OneFormerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_model_with_labels(self): - size = (self.model_tester.min_size,) * 2 - inputs = { - "pixel_values": ops.randn((2, 3, *size)), - "task_inputs": ops.randint(high=self.model_tester.vocab_size, size=(2, 77)).long(), - "text_inputs": ops.randint( - high=self.model_tester.vocab_size, size=(2, 6, 77) - ).long(), - "mask_labels": ops.randn((2, 150, *size)), - "class_labels": ops.zeros(2, 150).long(), - } - - config = self.model_tester.get_config() - config.is_training = True - - 
model = OneFormerForUniversalSegmentation(config) - outputs = model(**inputs) - self.assertTrue(outputs.loss is not None) - - def test_hidden_states_output(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_oneformer_model(config, **inputs, output_hidden_states=True) - - def test_attention_outputs(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - outputs = model(**inputs, output_attentions=True) - self.assertTrue(outputs.attentions is not None) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.contrastive_temperature = 1 - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((ops.mean(param) * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - # only OneFormerForUniversalSegmentation has the loss - model_class = self.all_model_classes[1] - ( - config, - pixel_values, - task_inputs, - text_inputs, - pixel_mask, - mask_labels, - class_labels, - ) = self.model_tester.prepare_config_and_inputs() - config.is_training = True - - model = model_class(config) - model.train() - - loss = model( - pixel_values, task_inputs, text_inputs=text_inputs, mask_labels=mask_labels, class_labels=class_labels - ).loss - loss.backward() - - -TOLERANCE = 1e-4 - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_vision -@slow -class OneFormerModelIntegrationTest(unittest.TestCase): - @cached_property - def model_checkpoints(self): - return "shi-labs/oneformer_ade20k_swin_tiny" - - @cached_property - def default_processor(self): - return OneFormerProcessor.from_pretrained(self.model_checkpoints) if is_vision_available() else None - - def test_inference_no_head(self): - model = OneFormerModel.from_pretrained(self.model_checkpoints) - processor = self.default_processor - image = prepare_img() - inputs = processor(image, ["semantic"], return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size - self.assertEqual(inputs_shape, (1, 3, 512, 682)) - - task_inputs_shape = inputs["task_inputs"].shape - # check size - self.assertEqual(task_inputs_shape, (1, 77)) - - with no_grad(): - outputs = model(**inputs) - - expected_slice_hidden_state = mindspore.tensor( - [[0.2723, 0.8280, 0.6026], [1.2699, 1.1257, 1.1444], [1.1344, 0.6153, 0.4177]] - ) - self.assertTrue( - ops.allclose( - outputs.encoder_hidden_states[-1][0, 0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE - ) - ) - - expected_slice_hidden_state = mindspore.tensor( - [[1.0581, 1.2276, 1.2003], [1.1903, 1.2925, 1.2862], [1.158, 1.2559, 1.3216]] - ) - self.assertTrue( - ops.allclose( - outputs.pixel_decoder_hidden_states[0][0, 0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE - ) - ) - - expected_slice_hidden_state = mindspore.tensor( - [[3.0668, -1.1833, -5.1103], [3.344, -3.362, -5.1101], [2.6017, -4.3613, -4.1444]] - ) - self.assertTrue( - ops.allclose( - 
outputs.transformer_decoder_class_predictions[0, :3, :3], expected_slice_hidden_state, atol=TOLERANCE - ) - ) - - def test_inference_universal_segmentation_head(self): - model = OneFormerForUniversalSegmentation.from_pretrained(self.model_checkpoints).eval() - processor = self.default_processor - image = prepare_img() - inputs = processor(image, ["semantic"], return_tensors="ms") - inputs_shape = inputs["pixel_values"].shape - # check size - self.assertEqual(inputs_shape, (1, 3, 512, 682)) - - with no_grad(): - outputs = model(**inputs) - - # masks_queries_logits - masks_queries_logits = outputs.masks_queries_logits - self.assertEqual( - masks_queries_logits.shape, - (1, model.config.num_queries, inputs_shape[-2] // 4, (inputs_shape[-1] + 2) // 4), - ) - expected_slice = [[[3.1848, 4.2141, 4.1993], [2.9000, 3.5721, 3.6603], [2.5358, 3.0883, 3.6168]]] - expected_slice = mindspore.tensor(expected_slice) - self.assertTrue(ops.allclose(masks_queries_logits[0, 0, :3, :3], expected_slice, atol=TOLERANCE)) - # class_queries_logits - class_queries_logits = outputs.class_queries_logits - self.assertEqual( - class_queries_logits.shape, - (1, model.config.num_queries, model.config.num_labels + 1), - ) - expected_slice = mindspore.tensor( - [[3.0668, -1.1833, -5.1103], [3.3440, -3.3620, -5.1101], [2.6017, -4.3613, -4.1444]] - ) - self.assertTrue(ops.allclose(class_queries_logits[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_fp16(self): - model = ( - OneFormerForUniversalSegmentation.from_pretrained(self.model_checkpoints) - .to(mindspore.float16) - .eval() - ) - processor = self.default_processor - image = prepare_img() - inputs = processor(image, ["semantic"], return_tensors="ms").to(mindspore.float16) - - with no_grad(): - _ = model(**inputs) - - def test_with_segmentation_maps_and_loss(self): - dummy_model = OneFormerForUniversalSegmentation.from_pretrained(self.model_checkpoints) - processor = self.default_processor - processor.image_processor.num_text = dummy_model.config.num_queries - dummy_model.config.text_encoder_n_ctx - dummy_model.config.is_training = True - model = OneFormerForUniversalSegmentation(dummy_model.config).eval() - del dummy_model - - inputs = processor( - [np.zeros((3, 512, 640)), np.zeros((3, 512, 640))], - ["semantic", "semantic"], - segmentation_maps=[np.zeros((384, 384)).astype(np.float32), np.zeros((384, 384)).astype(np.float32)], - return_tensors="ms", - ) - - inputs["pixel_values"] = inputs["pixel_values"] - inputs["task_inputs"] = inputs["task_inputs"] - inputs["text_inputs"] = inputs["text_inputs"] - inputs["mask_labels"] = [el for el in inputs["mask_labels"]] - inputs["class_labels"] = [el for el in inputs["class_labels"]] - - with no_grad(): - outputs = model(**inputs) - - self.assertTrue(outputs.loss is not None) \ No newline at end of file diff --git a/tests/transformers/models/oneformer/test_processor_oneformer.py b/tests/transformers/models/oneformer/test_processor_oneformer.py deleted file mode 100644 index 493d70e29..000000000 --- a/tests/transformers/models/oneformer/test_processor_oneformer.py +++ /dev/null @@ -1,804 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch OneFormer processor.""" -# pylint: disable=line-too-long -# pylint: disable=not-callable - -import json -import os -import tempfile -import unittest - -import numpy as np -from mindspore import ops - -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from mindnlp.utils.testing_utils import check_json_file_has_correct_format, require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - - if is_vision_available(): - from mindnlp.transformers.models.oneformer import ( - OneFormerImageProcessor, - OneFormerProcessor - ) - from mindnlp.transformers import CLIPTokenizer - from mindnlp.transformers.models.oneformer.image_processing_oneformer import binary_mask_to_rle - from mindnlp.transformers.models.oneformer.modeling_oneformer import OneFormerForUniversalSegmentationOutput - -if is_vision_available(): - from PIL import Image - - -def prepare_metadata(class_info_file, repo_path="shi-labs/oneformer_demo"): - with open(hf_hub_download(repo_path, class_info_file, repo_type="dataset"), "r") as f: - class_info = json.load(f) - metadata = {} - class_names = [] - thing_ids = [] - - for key, info in class_info.items(): - metadata[key] = info["name"] - class_names.append(info["name"]) - if info["isthing"]: - thing_ids.append(int(key)) - - metadata["thing_ids"] = thing_ids - metadata["class_names"] = class_names - return metadata - - -class OneFormerProcessorTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - size=None, - do_resize=True, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - num_labels=10, - reduce_labels=False, - ignore_index=255, - max_seq_length=77, - task_seq_length=77, - model_repo="shi-labs/oneformer_ade20k_swin_tiny", - class_info_file="ade20k_panoptic.json", - num_text=10, - ): - super().__init__() - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = {"shortest_edge": 32, "longest_edge": 1333} if size is None else size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.max_seq_length = max_seq_length - self.task_seq_length = task_seq_length - self.class_info_file = class_info_file - self.metadata = prepare_metadata(class_info_file) - self.num_text = num_text - self.model_repo = model_repo - - # for the post_process_functions - self.batch_size = 2 - self.num_queries = 10 - self.num_classes = 10 - self.height = 3 - self.width = 4 - self.num_labels = num_labels - self.reduce_labels = reduce_labels - self.ignore_index = ignore_index - - def prepare_processor_dict(self): - image_processor_dict = { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": 
self.image_std, - "num_labels": self.num_labels, - "reduce_labels": self.reduce_labels, - "ignore_index": self.ignore_index, - "class_info_file": self.class_info_file, - "metadata": self.metadata, - "num_text": self.num_text, - } - - image_processor = OneFormerImageProcessor(**image_processor_dict) - tokenizer = CLIPTokenizer.from_pretrained(self.model_repo) - - return { - "image_processor": image_processor, - "tokenizer": tokenizer, - "max_seq_length": self.max_seq_length, - "task_seq_length": self.task_seq_length, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to OneFormerProcessor, - assuming do_resize is set to True with a scalar size. It also provides the expected sequence length - for the task_inputs and text_list_input. - """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - if w < h: - expected_height = int(self.size["shortest_edge"] * h / w) - expected_width = self.size["shortest_edge"] - elif w > h: - expected_height = self.size["shortest_edge"] - expected_width = int(self.size["shortest_edge"] * w / h) - else: - expected_height = self.size["shortest_edge"] - expected_width = self.size["shortest_edge"] - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width, expected_sequence_length = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width, expected_sequence_length)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - expected_sequence_length = self.max_seq_length - - return expected_height, expected_width, expected_sequence_length - - def get_fake_oneformer_outputs(self): - return OneFormerForUniversalSegmentationOutput( - # +1 for null class - class_queries_logits=ops.randn((self.batch_size, self.num_queries, self.num_classes + 1)), - masks_queries_logits=ops.randn((self.batch_size, self.num_queries, self.height, self.width)), - ) - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class OneFormerProcessingTest(unittest.TestCase): - processing_class = OneFormerProcessor if (is_vision_available() and is_mindspore_available()) else None - # only for test_feat_extracttion_common.test_feat_extract_to_json_string - feature_extraction_class = processing_class - - def setUp(self): - self.processing_tester = OneFormerProcessorTester(self) - - @property - def processor_dict(self): - return self.processing_tester.prepare_processor_dict() - - def test_feat_extract_properties(self): - processor = self.processing_class(**self.processor_dict) - self.assertTrue(hasattr(processor, "image_processor")) - self.assertTrue(hasattr(processor, "tokenizer")) - self.assertTrue(hasattr(processor, "max_seq_length")) - self.assertTrue(hasattr(processor, "task_seq_length")) - - def test_batch_feature(self): - pass - - def test_call_pil(self): - # Initialize processor - processor = self.processing_class(**self.processor_dict) - # create random PIL images - image_inputs = 
self.processing_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = processor(image_inputs[0], ["semantic"], return_tensors="ms").pixel_values - - expected_height, expected_width, expected_sequence_length = self.processing_tester.get_expected_values( - image_inputs - ) - - self.assertEqual( - encoded_images.shape, - (1, self.processing_tester.num_channels, expected_height, expected_width), - ) - - tokenized_task_inputs = processor(image_inputs[0], ["semantic"], return_tensors="ms").task_inputs - - self.assertEqual( - tokenized_task_inputs.shape, - (1, expected_sequence_length), - ) - - # Test batched - expected_height, expected_width, expected_sequence_length = self.processing_tester.get_expected_values( - image_inputs, batched=True - ) - - encoded_images = processor(image_inputs, ["semantic"] * len(image_inputs), return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.processing_tester.batch_size, - self.processing_tester.num_channels, - expected_height, - expected_width, - ), - ) - - tokenized_task_inputs = processor( - image_inputs, ["semantic"] * len(image_inputs), return_tensors="ms" - ).task_inputs - - self.assertEqual( - tokenized_task_inputs.shape, - (self.processing_tester.batch_size, expected_sequence_length), - ) - - def test_call_numpy(self): - # Initialize processor - processor = self.processing_class(**self.processor_dict) - # create random numpy tensors - image_inputs = self.processing_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = processor(image_inputs[0], ["semantic"], return_tensors="ms").pixel_values - - expected_height, expected_width, expected_sequence_length = self.processing_tester.get_expected_values( - image_inputs - ) - - self.assertEqual( - encoded_images.shape, - (1, self.processing_tester.num_channels, expected_height, expected_width), - ) - - tokenized_task_inputs = processor(image_inputs[0], ["semantic"], return_tensors="ms").task_inputs - - self.assertEqual( - tokenized_task_inputs.shape, - (1, expected_sequence_length), - ) - - # Test batched - expected_height, expected_width, expected_sequence_length = self.processing_tester.get_expected_values( - image_inputs, batched=True - ) - - encoded_images = processor(image_inputs, ["semantic"] * len(image_inputs), return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.processing_tester.batch_size, - self.processing_tester.num_channels, - expected_height, - expected_width, - ), - ) - - tokenized_task_inputs = processor( - image_inputs, ["semantic"] * len(image_inputs), return_tensors="ms" - ).task_inputs - - self.assertEqual( - tokenized_task_inputs.shape, - (self.processing_tester.batch_size, expected_sequence_length), - ) - - def test_call_pytorch(self): - # Initialize processor - processor = self.processing_class(**self.processor_dict) - # create random MindSpore tensors - image_inputs = self.processing_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, mindspore.Tensor) - - # Test not batched input - encoded_images = processor(image_inputs[0], ["semantic"], return_tensors="ms").pixel_values - - expected_height, expected_width, expected_sequence_length = self.processing_tester.get_expected_values( - image_inputs - ) 
- - self.assertEqual( - encoded_images.shape, - (1, self.processing_tester.num_channels, expected_height, expected_width), - ) - - tokenized_task_inputs = processor(image_inputs[0], ["semantic"], return_tensors="ms").task_inputs - - self.assertEqual( - tokenized_task_inputs.shape, - (1, expected_sequence_length), - ) - - # Test batched - expected_height, expected_width, expected_sequence_length = self.processing_tester.get_expected_values( - image_inputs, batched=True - ) - - encoded_images = processor(image_inputs, ["semantic"] * len(image_inputs), return_tensors="ms").pixel_values - self.assertEqual( - encoded_images.shape, - ( - self.processing_tester.batch_size, - self.processing_tester.num_channels, - expected_height, - expected_width, - ), - ) - - tokenized_task_inputs = processor( - image_inputs, ["semantic"] * len(image_inputs), return_tensors="ms" - ).task_inputs - - self.assertEqual( - tokenized_task_inputs.shape, - (self.processing_tester.batch_size, expected_sequence_length), - ) - - def comm_get_processor_inputs(self, with_segmentation_maps=False, is_instance_map=False, segmentation_type="np"): - processor = self.processing_class(**self.processor_dict) - # prepare image and target - num_labels = self.processing_tester.num_labels - annotations = None - instance_id_to_semantic_id = None - image_inputs = self.processing_tester.prepare_image_inputs(equal_resolution=False) - if with_segmentation_maps: - high = num_labels - if is_instance_map: - labels_expanded = list(range(num_labels)) * 2 - instance_id_to_semantic_id = dict(enumerate(labels_expanded)) - annotations = [ - np.random.randint(0, high * 2, (img.size[1], img.size[0])).astype(np.uint8) for img in image_inputs - ] - if segmentation_type == "pil": - annotations = [Image.fromarray(annotation) for annotation in annotations] - - inputs = processor( - image_inputs, - ["semantic"] * len(image_inputs), - annotations, - return_tensors="ms", - instance_id_to_semantic_id=instance_id_to_semantic_id, - pad_and_return_pixel_mask=True, - ) - - return inputs - - def test_init_without_params(self): - pass - - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - feat_extract_first.save_pretrained(tmpdirname) - check_json_file_has_correct_format(os.path.join(tmpdirname, "preprocessor_config.json")) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - self.assertEqual(feat_extract_second.image_processor.to_dict(), feat_extract_first.image_processor.to_dict()) - self.assertIsInstance(feat_extract_first.image_processor, OneFormerImageProcessor) - self.assertIsInstance(feat_extract_first.tokenizer, CLIPTokenizer) - - def test_call_with_segmentation_maps(self): - def common(is_instance_map=False, segmentation_type=None): - inputs = self.comm_get_processor_inputs( - with_segmentation_maps=True, is_instance_map=is_instance_map, segmentation_type=segmentation_type - ) - - mask_labels = inputs["mask_labels"] - class_labels = inputs["class_labels"] - pixel_values = inputs["pixel_values"] - text_inputs = inputs["text_inputs"] - - # check the batch_size - for mask_label, class_label, text_input in zip(mask_labels, class_labels, text_inputs): - self.assertEqual(mask_label.shape[0], class_label.shape[0]) - # this ensure padding has happened - self.assertEqual(mask_label.shape[1:], pixel_values.shape[2:]) - self.assertEqual(text_input.shape[0], self.processing_tester.num_text) - - common() - 
common(is_instance_map=True) - common(is_instance_map=False, segmentation_type="pil") - common(is_instance_map=True, segmentation_type="pil") - - def test_integration_semantic_segmentation(self): - # load 2 images and corresponding panoptic annotations from the hub - dataset = load_dataset("nielsr/ade20k-panoptic-demo") - image1 = dataset["train"][0]["image"] - image2 = dataset["train"][1]["image"] - segments_info1 = dataset["train"][0]["segments_info"] - segments_info2 = dataset["train"][1]["segments_info"] - annotation1 = dataset["train"][0]["label"] - annotation2 = dataset["train"][1]["label"] - - def rgb_to_id(color): - if isinstance(color, np.ndarray) and len(color.shape) == 3: - if color.dtype == np.uint8: - color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] - return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - - def create_panoptic_map(annotation, segments_info): - annotation = np.array(annotation) - # convert RGB to segment IDs per pixel - # 0 is the "ignore" label, for which we don't need to make binary masks - panoptic_map = rgb_to_id(annotation) - - # create mapping between segment IDs and semantic classes - inst2class = {segment["id"]: segment["category_id"] for segment in segments_info} - - return panoptic_map, inst2class - - panoptic_map1, inst2class1 = create_panoptic_map(annotation1, segments_info1) - panoptic_map2, inst2class2 = create_panoptic_map(annotation2, segments_info2) - - image_processor = OneFormerImageProcessor( - reduce_labels=True, - ignore_index=0, - size=(512, 512), - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - - processor = OneFormerProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - max_seq_length=77, - task_seq_length=77, - ) - - # prepare the images and annotations - pixel_values_list = [np.moveaxis(np.array(image1), -1, 0), np.moveaxis(np.array(image2), -1, 0)] - inputs = processor.encode_inputs( - pixel_values_list, - ["semantic", "semantic"], - [panoptic_map1, panoptic_map2], - instance_id_to_semantic_id=[inst2class1, inst2class2], - return_tensors="ms", - ) - - # verify the pixel values, task inputs, text inputs and pixel mask - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 512, 711)) - self.assertEqual(inputs["pixel_mask"].shape, (2, 512, 711)) - self.assertEqual(inputs["task_inputs"].shape, (2, 77)) - self.assertEqual(inputs["text_inputs"].shape, (2, self.processing_tester.num_text, 77)) - - # verify the class labels - self.assertEqual(len(inputs["class_labels"]), 2) - expected_class_labels = mindspore.tensor([4, 17, 32, 42, 12, 3, 5, 0, 43, 96, 104, 31, 125, 138, 87, 149]) # noqa: E231 # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][0].numpy(), expected_class_labels.numpy())) - expected_class_labels = mindspore.tensor([19, 67, 82, 17, 12, 42, 3, 14, 5, 0, 115, 43, 8, 138, 125, 143]) # noqa: E231 # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][1].numpy(), expected_class_labels.numpy())) - - # verify the task inputs - self.assertEqual(len(inputs["task_inputs"]), 2) - self.assertEqual(inputs["task_inputs"][0].sum().item(), 141082) - self.assertEqual(inputs["task_inputs"][0].sum().item(), inputs["task_inputs"][1].sum().item()) - - # verify the text inputs - self.assertEqual(len(inputs["text_inputs"]), 2) - self.assertEqual(inputs["text_inputs"][0].sum().item(), 1095752) - 
self.assertEqual(inputs["text_inputs"][1].sum().item(), 1062468) - - # verify the mask labels - self.assertEqual(len(inputs["mask_labels"]), 2) - self.assertEqual(inputs["mask_labels"][0].shape, (16, 512, 711)) - self.assertEqual(inputs["mask_labels"][1].shape, (16, 512, 711)) - self.assertEqual(inputs["mask_labels"][0].sum().item(), 315193.0) - self.assertEqual(inputs["mask_labels"][1].sum().item(), 350747.0) - - def test_integration_instance_segmentation(self): - # load 2 images and corresponding panoptic annotations from the hub - dataset = load_dataset("nielsr/ade20k-panoptic-demo") - image1 = dataset["train"][0]["image"] - image2 = dataset["train"][1]["image"] - segments_info1 = dataset["train"][0]["segments_info"] - segments_info2 = dataset["train"][1]["segments_info"] - annotation1 = dataset["train"][0]["label"] - annotation2 = dataset["train"][1]["label"] - - def rgb_to_id(color): - if isinstance(color, np.ndarray) and len(color.shape) == 3: - if color.dtype == np.uint8: - color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] - return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - - def create_panoptic_map(annotation, segments_info): - annotation = np.array(annotation) - # convert RGB to segment IDs per pixel - # 0 is the "ignore" label, for which we don't need to make binary masks - panoptic_map = rgb_to_id(annotation) - - # create mapping between segment IDs and semantic classes - inst2class = {segment["id"]: segment["category_id"] for segment in segments_info} - - return panoptic_map, inst2class - - panoptic_map1, inst2class1 = create_panoptic_map(annotation1, segments_info1) - panoptic_map2, inst2class2 = create_panoptic_map(annotation2, segments_info2) - - image_processor = OneFormerImageProcessor( - reduce_labels=True, - ignore_index=0, - size=(512, 512), - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - - processor = OneFormerProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - max_seq_length=77, - task_seq_length=77, - ) - - # prepare the images and annotations - pixel_values_list = [np.moveaxis(np.array(image1), -1, 0), np.moveaxis(np.array(image2), -1, 0)] - inputs = processor.encode_inputs( - pixel_values_list, - ["instance", "instance"], - [panoptic_map1, panoptic_map2], - instance_id_to_semantic_id=[inst2class1, inst2class2], - return_tensors="ms", - ) - - # verify the pixel values, task inputs, text inputs and pixel mask - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 512, 711)) - self.assertEqual(inputs["pixel_mask"].shape, (2, 512, 711)) - self.assertEqual(inputs["task_inputs"].shape, (2, 77)) - self.assertEqual(inputs["text_inputs"].shape, (2, self.processing_tester.num_text, 77)) - - # verify the class labels - self.assertEqual(len(inputs["class_labels"]), 2) - expected_class_labels = mindspore.tensor([32, 42, 42, 42, 42, 42, 42, 42, 32, 12, 12, 12, 12, 12, 42, 42, 12, 12, 12, 42, 12, 12, 12, 12, 12, 12, 12, 12, 12, 42, 42, 42, 12, 42, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 43, 43, 43, 43, 104, 43, 31, 125, 31, 125, 138, 87, 125, 149, 138, 125, 87, 87]) # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][0].numpy(), expected_class_labels.numpy())) - expected_class_labels = mindspore.tensor([19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 67, 82, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 12, 12, 42, 
12, 12, 12, 12, 14, 12, 12, 12, 12, 12, 12, 12, 12, 14, 12, 12, 115, 43, 43, 115, 43, 43, 43, 8, 8, 8, 138, 138, 125, 143]) # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][1].numpy(), expected_class_labels.numpy())) - - # verify the task inputs - self.assertEqual(len(inputs["task_inputs"]), 2) - self.assertEqual(inputs["task_inputs"][0].sum().item(), 144985) - self.assertEqual(inputs["task_inputs"][0].sum().item(), inputs["task_inputs"][1].sum().item()) - - # verify the text inputs - self.assertEqual(len(inputs["text_inputs"]), 2) - self.assertEqual(inputs["text_inputs"][0].sum().item(), 1037040) - self.assertEqual(inputs["text_inputs"][1].sum().item(), 1044078) - - # verify the mask labels - self.assertEqual(len(inputs["mask_labels"]), 2) - self.assertEqual(inputs["mask_labels"][0].shape, (73, 512, 711)) - self.assertEqual(inputs["mask_labels"][1].shape, (57, 512, 711)) - self.assertEqual(inputs["mask_labels"][0].sum().item(), 35040.0) - self.assertEqual(inputs["mask_labels"][1].sum().item(), 98228.0) - - def test_integration_panoptic_segmentation(self): - # load 2 images and corresponding panoptic annotations from the hub - dataset = load_dataset("nielsr/ade20k-panoptic-demo") - image1 = dataset["train"][0]["image"] - image2 = dataset["train"][1]["image"] - segments_info1 = dataset["train"][0]["segments_info"] - segments_info2 = dataset["train"][1]["segments_info"] - annotation1 = dataset["train"][0]["label"] - annotation2 = dataset["train"][1]["label"] - - def rgb_to_id(color): - if isinstance(color, np.ndarray) and len(color.shape) == 3: - if color.dtype == np.uint8: - color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] - return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) - - def create_panoptic_map(annotation, segments_info): - annotation = np.array(annotation) - # convert RGB to segment IDs per pixel - # 0 is the "ignore" label, for which we don't need to make binary masks - panoptic_map = rgb_to_id(annotation) - - # create mapping between segment IDs and semantic classes - inst2class = {segment["id"]: segment["category_id"] for segment in segments_info} - - return panoptic_map, inst2class - - panoptic_map1, inst2class1 = create_panoptic_map(annotation1, segments_info1) - panoptic_map2, inst2class2 = create_panoptic_map(annotation2, segments_info2) - - image_processor = OneFormerImageProcessor( - reduce_labels=True, - ignore_index=0, - size=(512, 512), - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - - processor = OneFormerProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - max_seq_length=77, - task_seq_length=77, - ) - - # prepare the images and annotations - pixel_values_list = [np.moveaxis(np.array(image1), -1, 0), np.moveaxis(np.array(image2), -1, 0)] - inputs = processor.encode_inputs( - pixel_values_list, - ["panoptic", "panoptic"], - [panoptic_map1, panoptic_map2], - instance_id_to_semantic_id=[inst2class1, inst2class2], - return_tensors="ms", - ) - - # verify the pixel values, task inputs, text inputs and pixel mask - self.assertEqual(inputs["pixel_values"].shape, (2, 3, 512, 711)) - self.assertEqual(inputs["pixel_mask"].shape, (2, 512, 711)) - self.assertEqual(inputs["task_inputs"].shape, (2, 77)) - self.assertEqual(inputs["text_inputs"].shape, (2, self.processing_tester.num_text, 77)) - - # verify the class labels - 
self.assertEqual(len(inputs["class_labels"]), 2) - expected_class_labels = mindspore.tensor([4, 17, 32, 42, 42, 42, 42, 42, 42, 42, 32, 12, 12, 12, 12, 12, 42, 42, 12, 12, 12, 42, 12, 12, 12, 12, 12, 3, 12, 12, 12, 12, 42, 42, 42, 12, 42, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 5, 12, 12, 12, 12, 12, 12, 12, 0, 43, 43, 43, 96, 43, 104, 43, 31, 125, 31, 125, 138, 87, 125, 149, 138, 125, 87, 87]) # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][0].numpy(), expected_class_labels.numpy())) - expected_class_labels = mindspore.tensor([19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 67, 82, 19, 19, 17, 19, 19, 19, 19, 19, 19, 19, 19, 19, 12, 12, 42, 12, 12, 12, 12, 3, 14, 12, 12, 12, 12, 12, 12, 12, 12, 14, 5, 12, 12, 0, 115, 43, 43, 115, 43, 43, 43, 8, 8, 8, 138, 138, 125, 143]) # fmt: skip - self.assertTrue(np.allclose(inputs["class_labels"][1].numpy(), expected_class_labels.numpy())) - - # verify the task inputs - self.assertEqual(len(inputs["task_inputs"]), 2) - self.assertEqual(inputs["task_inputs"][0].sum().item(), 136240) - self.assertEqual(inputs["task_inputs"][0].sum().item(), inputs["task_inputs"][1].sum().item()) - - # verify the text inputs - self.assertEqual(len(inputs["text_inputs"]), 2) - self.assertEqual(inputs["text_inputs"][0].sum().item(), 1048653) - self.assertEqual(inputs["text_inputs"][1].sum().item(), 1067160) - - # verify the mask labels - self.assertEqual(len(inputs["mask_labels"]), 2) - self.assertEqual(inputs["mask_labels"][0].shape, (79, 512, 711)) - self.assertEqual(inputs["mask_labels"][1].shape, (61, 512, 711)) - self.assertEqual(inputs["mask_labels"][0].sum().item(), 315193.0) - self.assertEqual(inputs["mask_labels"][1].sum().item(), 350747.0) - - def test_binary_mask_to_rle(self): - fake_binary_mask = np.zeros((20, 50)) - fake_binary_mask[0, 20:] = 1 - fake_binary_mask[1, :15] = 1 - fake_binary_mask[5, :10] = 1 - - rle = binary_mask_to_rle(fake_binary_mask) - self.assertEqual(len(rle), 4) - self.assertEqual(rle[0], 21) - self.assertEqual(rle[1], 45) - - def test_post_process_semantic_segmentation(self): - image_processor = OneFormerImageProcessor( - reduce_labels=True, - ignore_index=0, - size=(512, 512), - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - processor = OneFormerProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - max_seq_length=77, - task_seq_length=77, - ) - - outputs = self.processing_tester.get_fake_oneformer_outputs() - - segmentation = processor.post_process_semantic_segmentation(outputs) - - self.assertEqual(len(segmentation), self.processing_tester.batch_size) - self.assertEqual( - segmentation[0].shape, - ( - self.processing_tester.height, - self.processing_tester.width, - ), - ) - - target_sizes = [(1, 4) for i in range(self.processing_tester.batch_size)] - segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes) - - self.assertEqual(segmentation[0].shape, target_sizes[0]) - - def test_post_process_instance_segmentation(self): - image_processor = OneFormerImageProcessor( - reduce_labels=True, - ignore_index=0, - size=(512, 512), - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - processor = OneFormerProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - max_seq_length=77, - task_seq_length=77, - ) 
- - outputs = self.processing_tester.get_fake_oneformer_outputs() - segmentation = processor.post_process_instance_segmentation(outputs, threshold=0) - - self.assertTrue(len(segmentation) == self.processing_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual(el["segmentation"].shape, (self.processing_tester.height, self.processing_tester.width)) - - def test_post_process_panoptic_segmentation(self): - image_processor = OneFormerImageProcessor( - reduce_labels=True, - ignore_index=0, - size=(512, 512), - class_info_file="ade20k_panoptic.json", - num_text=self.processing_tester.num_text, - ) - tokenizer = CLIPTokenizer.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny") - processor = OneFormerProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - max_seq_length=77, - task_seq_length=77, - ) - - outputs = self.processing_tester.get_fake_oneformer_outputs() - segmentation = processor.post_process_panoptic_segmentation(outputs, threshold=0) - - self.assertTrue(len(segmentation) == self.processing_tester.batch_size) - for el in segmentation: - self.assertTrue("segmentation" in el) - self.assertTrue("segments_info" in el) - self.assertEqual(type(el["segments_info"]), list) - self.assertEqual(el["segmentation"].shape, (self.processing_tester.height, self.processing_tester.width)) diff --git a/tests/transformers/models/openai/__init__.py b/tests/transformers/models/openai/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/openai/test_modeling_gpt.py b/tests/transformers/models/openai/test_modeling_gpt.py deleted file mode 100644 index 1b5787a5f..000000000 --- a/tests/transformers/models/openai/test_modeling_gpt.py +++ /dev/null @@ -1,302 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - OpenAIGPTConfig, - OpenAIGPTDoubleHeadsModel, - OpenAIGPTForSequenceClassification, - OpenAIGPTLMHeadModel, - OpenAIGPTModel, - ) - - -class OpenAIGPTModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.pad_token_id = self.vocab_size - 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = OpenAIGPTConfig( - vocab_size=self.vocab_size, - n_embd=self.hidden_size, - n_layer=self.num_hidden_layers, - n_head=self.num_attention_heads, - # intermediate_size=self.intermediate_size, - # hidden_act=self.hidden_act, - # hidden_dropout_prob=self.hidden_dropout_prob, - # attention_probs_dropout_prob=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - # type_vocab_size=self.type_vocab_size, - # initializer_range=self.initializer_range - pad_token_id=self.pad_token_id, - ) - - head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - - return ( - config, - input_ids, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): - model = OpenAIGPTModel(config=config) - model.eval() - - result = model(input_ids, 
token_type_ids=token_type_ids, head_mask=head_mask) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): - model = OpenAIGPTLMHeadModel(config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): - model = OpenAIGPTDoubleHeadsModel(config) - model.eval() - - result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_openai_gpt_for_sequence_classification( - self, config, input_ids, head_mask, token_type_ids, *args - ): - config.num_labels = self.num_labels - model = OpenAIGPTForSequenceClassification(config) - model.eval() - - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "head_mask": head_mask, - } - - return config, inputs_dict - - -@require_mindspore -class OpenAIGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTForSequenceClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (OpenAIGPTLMHeadModel,) if is_mindspore_available() else () - ) # TODO (PVP): Add Double HeadsModel when generate() function is changed accordingly - pipeline_model_mapping = ( - { - "feature-extraction": OpenAIGPTModel, - "text-classification": OpenAIGPTForSequenceClassification, - "text-generation": OpenAIGPTLMHeadModel, - "zero-shot": OpenAIGPTForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "ZeroShotClassificationPipelineTests": - # Get `tokenizer does not have a padding token` error for both fast/slow tokenizers. - # `OpenAIGPTConfig` was never used in pipeline tests, either because of a missing checkpoint or because a - # tiny config could not be created. 
- return True - - return False - - # special case for DoubleHeads model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "OpenAIGPTDoubleHeadsModel": - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["input_ids"] = inputs_dict["labels"] - inputs_dict["token_type_ids"] = inputs_dict["labels"] - inputs_dict["mc_token_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_choices), - dtype=mindspore.int64, - ) - inputs_dict["mc_labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - return inputs_dict - - def setUp(self): - self.model_tester = OpenAIGPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_openai_gpt_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs) - - def test_openai_gpt_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_lm_head_model(*config_and_inputs) - - def test_openai_gpt_double_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs) - - def test_openai_gpt_classification_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_openai_gpt_for_sequence_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "openai-community/openai-gpt" - model = OpenAIGPTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class OPENAIGPTModelLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_openai_gpt(self): - model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt") - input_ids = mindspore.tensor([[481, 4735, 544]], dtype=mindspore.int64) # the president is - expected_output_ids = [ - 481, - 4735, - 544, - 246, - 963, - 870, - 762, - 239, - 244, - 40477, - 244, - 249, - 719, - 881, - 487, - 544, - 240, - 244, - 603, - 481, - ] # the president is a very good man. " \n " i\'m sure he is, " said the - - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) \ No newline at end of file diff --git a/tests/transformers/models/opt/__init__.py b/tests/transformers/models/opt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/opt/test_modeling_opt.py b/tests/transformers/models/opt/test_modeling_opt.py deleted file mode 100644 index 050fe9a5f..000000000 --- a/tests/transformers/models/opt/test_modeling_opt.py +++ /dev/null @@ -1,561 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore OPT model. """ - - -import copy -import tempfile -import unittest -import numpy as np - -from mindnlp.transformers import OPTConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - GPT2Tokenizer, - OPTForCausalLM, - OPTForQuestionAnswering, - OPTForSequenceClassification, - OPTModel, - ) - - -def prepare_opt_inputs_dict( - config, - input_ids, - decoder_input_ids=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - } - - -class OPTModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - embed_dim=16, - num_labels=3, - word_embed_proj_dim=16, - type_sequence_label_size=2, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.embed_dim = embed_dim - self.num_labels = num_labels - self.type_sequence_label_size = type_sequence_label_size - self.word_embed_proj_dim = word_embed_proj_dim - self.is_encoder_decoder = False - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_opt_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return OPTConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - 
attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - embed_dim=self.embed_dim, - is_encoder_decoder=False, - word_embed_proj_dim=self.word_embed_proj_dim, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.max_position_embeddings = 100 - return config - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = OPTModel(config=config).set_train(False) - - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([attention_mask.astype(next_attn_mask.dtype), next_attn_mask], axis=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - # test no attention_mask works - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - _, past_key_values = outputs.to_tuple() - output_from_no_past = model(next_input_ids)["last_hidden_state"] - - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - -@require_mindspore -class OPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (OPTForCausalLM, OPTForSequenceClassification, OPTForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (OPTForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": OPTModel, - "question-answering": OPTForQuestionAnswering, - "text-classification": OPTForSequenceClassification, - "text-generation": OPTForCausalLM, - "zero-shot": OPTForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = False - fx_compatible = True - 
test_pruning = False - test_missing_keys = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizer are used. - # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - def setUp(self): - self.model_tester = OPTModelTester(self) - self.config_tester = ConfigTester(self, config_class=OPTConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (OPTModel,): - model = model_class(config) - model - model.set_train(False) - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - model(**inputs)[0] - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = OPTForCausalLM(config).set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - def test_opt_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = OPTForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_opt_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask 
= input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = OPTForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") - def test_model_parallelism(self): - super().test_model_parallelism() - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.size > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -@require_mindspore -class OPTModelIntegrationTests(unittest.TestCase): - @slow - def test_inference_no_head(self): - model = OPTModel.from_pretrained("facebook/opt-350m", ms_dtype=mindspore.float32) - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids=input_ids).last_hidden_state - - expected_shape = (1, 11, 512) - self.assertEqual(output.shape, expected_shape) - # expected value works for CPU, as well as GPU (with TF32 disabled) - expected_slice = mindspore.tensor( - [ - [-0.28726277, -1.9241608, -0.3058734], - [-1.2737825, -0.13332152, -0.18766522], - [0.41159445, 0.1191957, -1.3107123], - ], - ) - assert_tensors_close(output[0, :3, :3], expected_slice, atol=1e-3) - - -@require_mindspore -@slow -class OPTEmbeddingsTest(unittest.TestCase): - def setUp(self): - super().setUp() - self.path_model = "facebook/opt-350m" - - def test_load_model(self): - try: - _ = OPTForCausalLM.from_pretrained(self.path_model) - except BaseException: - self.fail("Failed loading model") - - def test_logits(self): - model = OPTForCausalLM.from_pretrained(self.path_model, ms_dtype=mindspore.float32) - model = model.set_train(False) - tokenizer = GPT2Tokenizer.from_pretrained(self.path_model) - - prompts = [ - "Today is a beautiful day and I want to", - "In the city of", - "Paris is the capital of France and", - "Computers and mobile phones have taken", - ] - # verify that prompt without BOS token is identical to Metaseq -> add_special_tokens=False - inputs = tokenizer(prompts, return_tensors="ms", padding=True, add_special_tokens=False) - logits = model(inputs.input_ids, attention_mask=inputs.attention_mask)[0].mean(axis=-1) - - logits_meta = mindspore.Tensor( - [ - [1.3851, -13.8923, -10.5229, -10.7533, -0.2309, -10.2384, -0.5365, -9.0947, -5.1670], - [-4.7073, -10.6276, -3.9415, -21.5242, -0.2822, -0.2822, -0.2822, -0.2822, -0.2822], - [0.6247, -3.4229, -8.9179, -1.4297, -14.1650, 1.4146, -9.0218, -0.2703, -0.2703], - [6.4783, -1.9913, -10.7926, -2.3336, 1.5092, -0.9974, -6.8213, 1.3477, 1.3477], - ] - ) - assert np.allclose(logits.asnumpy(), logits_meta.asnumpy(), atol=1e-3) - - -@slow -class OPTGenerationTest(unittest.TestCase): - @property - def prompts(self): - return [ - "Today is a beautiful day and I 
want", - "In the city of", - "Paris is the capital of France and", - "Computers and mobile phones have taken", - ] - - def test_generation_pre_attn_layer_norm(self): - model_id = "facebook/opt-125m" - - EXPECTED_OUTPUTS = [ - "Today is a beautiful day and I want to", - "In the city of New York, the city", - "Paris is the capital of France and the capital", - "Computers and mobile phones have taken over the", - ] - - predicted_outputs = [] - tokenizer = GPT2Tokenizer.from_pretrained(model_id) - model = OPTForCausalLM.from_pretrained(model_id) - - for prompt in self.prompts: - input_ids = tokenizer(prompt, return_tensors="ms").input_ids - - generated_ids = model.generate(input_ids, max_length=10) - - generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - predicted_outputs += generated_string - - self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS) - - def test_batch_generation(self): - model_id = "facebook/opt-350m" - - tokenizer = GPT2Tokenizer.from_pretrained(model_id) - model = OPTForCausalLM.from_pretrained(model_id) - model - - tokenizer.padding_side = "left" - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item() - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a dork.\nI'm a little bit", - "Today, I was in the middle of a conversation with a friend about the", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence]) - - def test_generation_post_attn_layer_norm(self): - model_id = "facebook/opt-350m" - - EXPECTED_OUTPUTS = [ - "Today is a beautiful day and I want to", - "In the city of San Francisco, the city", - "Paris is the capital of France and the capital", - "Computers and mobile phones have taken over the", - ] - - predicted_outputs = [] - tokenizer = GPT2Tokenizer.from_pretrained(model_id) - model = OPTForCausalLM.from_pretrained(model_id) - - for prompt in self.prompts: - input_ids = tokenizer(prompt, return_tensors="ms").input_ids - - generated_ids = model.generate(input_ids, max_length=10) - - generated_string = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - predicted_outputs += generated_string - - self.assertListEqual(predicted_outputs, EXPECTED_OUTPUTS) - - @require_mindspore - def test_batched_nan_fp16(self): - # a bug manifested starting at models facebook/opt-1.3 and larger when running batched generations, - # therefore not using a tiny model, but the smallest model the problem was seen with which is opt-1.3b. 
- # please refer to this github thread: https://github.com/huggingface/transformers/pull/17437 for more details - model_name = "facebook/opt-1.3b" - tokenizer = GPT2Tokenizer.from_pretrained(model_name, use_fast=False, padding_side="left") - - model = OPTForCausalLM.from_pretrained(model_name, ms_dtype=mindspore.float16, use_cache=True) - model = model.set_train(False) - - batch = tokenizer(["Who are you?", "Joe Biden is the president of"], padding=True, return_tensors="ms") - - input_ids = batch["input_ids"] - attention_mask = batch["attention_mask"] - - outputs = model(input_ids, attention_mask=attention_mask) - self.assertFalse( - ops.isnan(outputs.logits[0]).any().item() - ) # the first logits could contain NaNs if it fails - - @slow - def test_contrastive_search_opt(self): - article = ( - "A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the " - "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived " - "there?" - ) - - opt_tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-1.3b", ms_dtype=mindspore.float32) - opt_model = OPTForCausalLM.from_pretrained("facebook/opt-1.3b") - input_ids = opt_tokenizer(article, return_tensors="ms").input_ids - - outputs = opt_model.generate(input_ids, penalty_alpha=0.6, top_k=5, max_length=256) - generated_text = opt_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I " - "am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have " - "you lived there?\nStatue: A hundred years.\nHuman: And you’re from what country?\nStatue: The United " - "States of America.\nHuman: Why did you come to America?\nStatue: I came to escape the tyranny of my " - "country.\nHuman: What tyranny?\nStatue: They didn’t let me speak my mind.\nHuman: What was your " - "country?\nStatue: It was a country of immigrants.\nHuman: Who were the immigrants?\nStatue: They " - "were from all over the world.\nHuman: What language did they speak?\nStatue: French, Spanish, " - "Italian, German, English—you name it.\nHuman: And where did they come from?\nStatue: They came from " - "every country in the world.\nHuman: And you were born in what country?\nStatue: I was born in " - "France.\nHuman: And your parents were French?\nStatue" - ], - ) diff --git a/tests/transformers/models/owlv2/__init__.py b/tests/transformers/models/owlv2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/owlv2/test_image_processor_owlv2.py b/tests/transformers/models/owlv2/test_image_processor_owlv2.py deleted file mode 100644 index 7740dde8c..000000000 --- a/tests/transformers/models/owlv2/test_image_processor_owlv2.py +++ /dev/null @@ -1,201 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available -import numpy as np -from ...test_image_processing_common import ( - ImageProcessingTestMixin, - prepare_image_inputs, -) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ( - AutoProcessor, - Owlv2ForObjectDetection, - Owlv2ImageProcessor, - ) - -if is_mindspore_available(): - import mindspore - - -class Owlv2ImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], - do_convert_rgb=True, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size if size is not None else {"height": 18, "width": 18} - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_convert_rgb = do_convert_rgb - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def prepare_image_inputs( - self, equal_resolution=False, numpify=False, torchify=False - ): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = Owlv2ImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = Owlv2ImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict - ) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size={"height": 42, "width": 42} - ) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - @slow - def test_image_processor_integration_test(self): - processor = Owlv2ImageProcessor() - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - pixel_values = processor(image, return_tensors="ms").pixel_values - - mean_value = round(pixel_values.mean().item(), 4) - self.assertEqual(mean_value, 
0.2353) - - @slow - def test_image_processor_integration_test_resize(self): - checkpoint = "google/owlv2-base-patch16-ensemble" - processor = AutoProcessor.from_pretrained(checkpoint) - model = Owlv2ForObjectDetection.from_pretrained(checkpoint) - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - text = ["cat"] - target_size = image.size[::-1] - expected_boxes = mindspore.tensor( - [ - [ - 341.66656494140625, - 23.38756561279297, - 642.321044921875, - 371.3482971191406, - ], - [ - 6.753320693969727, - 51.96149826049805, - 326.61810302734375, - 473.12982177734375, - ], - ] - ) - - # single image - inputs = processor(text=[text], images=[image], return_tensors="ms") - - outputs = model(**inputs) - - results = processor.post_process_object_detection( - outputs, threshold=0.2, target_sizes=[target_size] - )[0] - - boxes = results["boxes"] - self.assertTrue( - np.allclose( - boxes.asnumpy(), expected_boxes.asnumpy(), atol=1e-1 - ), - f"Single image bounding boxes fail. Expected {expected_boxes}, got {boxes}", - ) - - # batch of images - inputs = processor( - text=[text, text], images=[image, image], return_tensors="ms" - ) - - outputs = model(**inputs) - results = processor.post_process_object_detection( - outputs, threshold=0.2, target_sizes=[target_size, target_size] - ) - - for result in results: - boxes = result["boxes"] - self.assertTrue( - np.allclose(boxes.asnumpy(), expected_boxes.asnumpy(), atol=1e-1), - f"Batch image bounding boxes fail. Expected {expected_boxes}, got {boxes}", - ) - - @unittest.skip( - "OWLv2 doesn't treat 4 channel PIL and numpy consistently yet" - ) # FIXME Amy - def test_call_numpy_4_channels(self): - pass diff --git a/tests/transformers/models/owlv2/test_modeling_owlv2.py b/tests/transformers/models/owlv2/test_modeling_owlv2.py deleted file mode 100644 index 3f5a87e89..000000000 --- a/tests/transformers/models/owlv2/test_modeling_owlv2.py +++ /dev/null @@ -1,770 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch Owlv2 model.""" - -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import Owlv2ForObjectDetection, Owlv2Model, Owlv2TextModel, Owlv2VisionModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import OwlViTProcessor - - -# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTVisionModelTester with OwlViT->Owlv2 -class Owlv2VisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=32, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return Owlv2VisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = Owlv2VisionModel(config=config) - model.eval() - - pixel_values = pixel_values.to(mindspore.float32) - - with no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - num_patches = (self.image_size // self.patch_size) ** 2 - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -# Copied from 
tests.models.owlvit.test_modeling_owlvit.OwlViTVisionModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2, owlvit-base-patch32->owlv2-base-patch16-ensemble -class Owlv2VisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as OWLV2 does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (Owlv2VisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = Owlv2VisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=Owlv2VisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="OWLV2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="OwlV2 does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="OwlV2 does not support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Owlv2VisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Owlv2VisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlv2-base-patch16-ensemble" - model = Owlv2VisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTTextModelTester with OwlViT->Owlv2 -class Owlv2TextModelTester: - def __init__( - self, - parent, - batch_size=12, - num_queries=4, - seq_length=16, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=12, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - 
max_position_embeddings=16, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.num_queries = num_queries - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size * self.num_queries, self.seq_length], self.vocab_size) - input_mask = None - - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size * self.num_queries, self.seq_length]) - - if input_mask is not None: - num_text, seq_length = input_mask.shape - - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(num_text,)) - for idx, start_index in enumerate(rnd_start_indices): - input_mask[idx, :int(start_index)] = 1 - input_mask[idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return Owlv2TextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = Owlv2TextModel(config=config) - model.eval() - with no_grad(): - result = model(input_ids=input_ids, attention_mask=input_mask) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size * self.num_queries, self.seq_length, self.hidden_size) - ) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.num_queries, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTTextModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2, owlvit-base-patch32->owlv2-base-patch16-ensemble -class Owlv2TextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Owlv2TextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = Owlv2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Owlv2TextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="OwlV2 does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="OwlV2 does not support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute 
gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="OWLV2 does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Owlv2TextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Owlv2TextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlv2-base-patch16-ensemble" - model = Owlv2TextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class Owlv2ModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = Owlv2TextModelTester(parent, **text_kwargs) - self.vision_model_tester = Owlv2VisionModelTester(parent, **vision_kwargs) - self.is_training = is_training - self.text_config = self.text_model_tester.get_config().to_dict() - self.vision_config = self.vision_model_tester.get_config().to_dict() - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return Owlv2Config.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = Owlv2Model(config).eval() - - with no_grad(): - result = model( - input_ids=input_ids, - pixel_values=pixel_values, - attention_mask=attention_mask, - ) - - image_logits_size = ( - self.vision_model_tester.batch_size, - self.text_model_tester.batch_size * self.text_model_tester.num_queries, - ) - text_logits_size = ( - self.text_model_tester.batch_size * self.text_model_tester.num_queries, - self.vision_model_tester.batch_size, - ) - self.parent.assertEqual(result.logits_per_image.shape, image_logits_size) - self.parent.assertEqual(result.logits_per_text.shape, text_logits_size) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "return_loss": False, - } - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2, owlvit-base-patch32->owlv2-base-patch16-ensemble -class Owlv2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Owlv2Model,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": Owlv2Model, - "zero-shot-object-detection": 
Owlv2ForObjectDetection, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = Owlv2ModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Owlv2Model does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initilization is different for OWLV2 - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save Owlv2Config and check if we can load Owlv2VisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = Owlv2VisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save Owlv2Config and check if we can load Owlv2TextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = Owlv2TextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlv2-base-patch16-ensemble" - model = Owlv2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTForObjectDetectionTester with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2 -class Owlv2ForObjectDetectionTester: - def __init__(self, parent, is_training=True): - self.parent = parent - self.text_model_tester = Owlv2TextModelTester(parent) - self.vision_model_tester = Owlv2VisionModelTester(parent) - self.is_training = is_training - self.text_config = self.text_model_tester.get_config().to_dict() - self.vision_config = self.vision_model_tester.get_config().to_dict() - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = 
self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - return config, pixel_values, input_ids, attention_mask - - def get_config(self): - return Owlv2Config.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64) - - def create_and_check_model(self, config, pixel_values, input_ids, attention_mask): - model = Owlv2ForObjectDetection(config).eval() - with no_grad(): - result = model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - return_dict=True, - ) - - pred_boxes_size = ( - self.vision_model_tester.batch_size, - (self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2, - 4, - ) - pred_logits_size = ( - self.vision_model_tester.batch_size, - (self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2, - 4, - ) - pred_class_embeds_size = ( - self.vision_model_tester.batch_size, - (self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2, - self.text_model_tester.hidden_size, - ) - self.parent.assertEqual(result.pred_boxes.shape, pred_boxes_size) - self.parent.assertEqual(result.logits.shape, pred_logits_size) - self.parent.assertEqual(result.class_embeds.shape, pred_class_embeds_size) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, input_ids, attention_mask = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@unittest.skipIf(mindspore.get_context('device_target') == 'CPU', 'CPU casuse some error') -@require_mindspore -# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTForObjectDetectionTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2, owlvit-base-patch32->owlv2-base-patch16-ensemble -class Owlv2ForObjectDetectionTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Owlv2ForObjectDetection,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = Owlv2ForObjectDetectionTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Owlv2Model does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Test_initialization is tested in individual model tests") - def test_initialization(self): - pass - - @unittest.skip(reason="Test_forward_signature is tested in individual model tests") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Test_save_load_fast_init_from_base is tested in individual model tests") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="OwlV2 does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="OwlV2 does not support training yet") - def 
test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlv2-base-patch16-ensemble" - model = Owlv2ForObjectDetection.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class Owlv2ModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "google/owlv2-base-patch16" - model = Owlv2Model.from_pretrained(model_name) - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=[["a photo of a cat", "a photo of a dog"]], - images=image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - expected_logits = mindspore.tensor([[-6.2229, -8.2601]]) - self.assertTrue(ops.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) - - @slow - def test_inference_object_detection(self): - model_name = "google/owlv2-base-patch16" - model = Owlv2ForObjectDetection.from_pretrained(model_name) - - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=[["a photo of a cat", "a photo of a dog"]], - images=image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - with no_grad(): - outputs = model(**inputs) - - num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2) - self.assertEqual(outputs.pred_boxes.shape, (1, num_queries, 4)) - - expected_slice_logits = mindspore.tensor( - [[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]] - ) - self.assertTrue(ops.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4)) - expected_slice_boxes = mindspore.tensor( - [[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]], - ) - self.assertTrue(ops.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) - - @slow - def test_inference_one_shot_object_detection(self): - model_name = "google/owlv2-base-patch16" - model = Owlv2ForObjectDetection.from_pretrained(model_name) - - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - query_image = prepare_img() - inputs = processor( - images=image, - query_images=query_image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - with no_grad(): - outputs = model.image_guided_detection(**inputs) - - num_queries = int((model.config.vision_config.image_size / 
model.config.vision_config.patch_size) ** 2) - self.assertEqual(outputs.target_pred_boxes.shape, (1, num_queries, 4)) - - expected_slice_boxes = mindspore.tensor( - [[0.2413, 0.0519, 0.4533], [0.1395, 0.0457, 0.2507], [0.2330, 0.0505, 0.4277]], - ) - self.assertTrue(ops.allclose(outputs.target_pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) - - @slow - @require_mindspore - def test_inference_one_shot_object_detection_fp16(self): - model_name = "google/owlv2-base-patch16" - model = Owlv2ForObjectDetection.from_pretrained(model_name, torch_dtype=mindspore.float16) - - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - query_image = prepare_img() - inputs = processor( - images=image, - query_images=query_image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - with no_grad(): - outputs = model.image_guided_detection(**inputs) - - # No need to check the logits, we just check inference runs fine. - num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2) - self.assertEqual(outputs.target_pred_boxes.shape, (1, num_queries, 4)) \ No newline at end of file diff --git a/tests/transformers/models/owlvit/__init__.py b/tests/transformers/models/owlvit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/owlvit/test_image_processing_owlvit.py b/tests/transformers/models/owlvit/test_image_processing_owlvit.py deleted file mode 100644 index 6386b0818..000000000 --- a/tests/transformers/models/owlvit/test_image_processing_owlvit.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ -"""Testing suite for the OwlViT image processing.""" - - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import OwlViTImageProcessor - - -class OwlViTImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_center_crop=True, - crop_size=None, - do_normalize=True, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], - do_convert_rgb=True, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size if size is not None else {"height": 18, "width": 18} - self.do_center_crop = do_center_crop - self.crop_size = crop_size if crop_size is not None else { - "height": 18, "width": 18} - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_convert_rgb = do_convert_rgb - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_center_crop": self.do_center_crop, - "crop_size": self.crop_size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_convert_rgb": self.do_convert_rgb, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class OwlViTImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = OwlViTImageProcessor - - def setUp(self): - self.image_processor_tester = OwlViTImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class( - **self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "center_crop")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - self.assertEqual(image_processor.crop_size, { - "height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, 
size=42, crop_size=84) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - self.assertEqual(image_processor.crop_size, { - "height": 84, "width": 84}) diff --git a/tests/transformers/models/owlvit/test_modeling_owlvit.py b/tests/transformers/models/owlvit/test_modeling_owlvit.py deleted file mode 100644 index df7f9fa29..000000000 --- a/tests/transformers/models/owlvit/test_modeling_owlvit.py +++ /dev/null @@ -1,758 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore OwlViT model.""" - -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import OwlViTForObjectDetection, OwlViTModel, OwlViTTextModel, OwlViTVisionModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import OwlViTProcessor - - -class OwlViTVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=32, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return OwlViTVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - 
intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = OwlViTVisionModel(config=config) - model.eval() - - pixel_values = pixel_values.to(mindspore.float32) - - with no_grad(): - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - num_patches = (self.image_size // self.patch_size) ** 2 - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class OwlViTVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as OWLVIT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (OwlViTVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = OwlViTVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=OwlViTVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="OWLVIT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="OWL-ViT does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="OWL-ViT does not support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="OwlViTVisionModel has no base class and is not available in MODEL_MAPPING") - def 
test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="OwlViTVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class OwlViTTextModelTester: - def __init__( - self, - parent, - batch_size=12, - num_queries=4, - seq_length=16, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=12, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=16, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.num_queries = num_queries - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size * self.num_queries, self.seq_length], self.vocab_size) - input_mask = None - - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size * self.num_queries, self.seq_length]) - - if input_mask is not None: - num_text, seq_length = input_mask.shape - - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(num_text,)) - for idx, start_index in enumerate(rnd_start_indices): - input_mask[idx, :int(start_index)] = 1 - input_mask[idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return OwlViTTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = OwlViTTextModel(config=config) - model.eval() - with no_grad(): - result = model(input_ids=input_ids, attention_mask=input_mask) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size * self.num_queries, self.seq_length, self.hidden_size) - ) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.num_queries, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class OwlViTTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (OwlViTTextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = OwlViTTextModelTester(self) - self.config_tester = ConfigTester(self, 
config_class=OwlViTTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="OWL-ViT does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="OWL-ViT does not support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="OWLVIT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="OwlViTTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="OwlViTTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class OwlViTModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = OwlViTTextModelTester(parent, **text_kwargs) - self.vision_model_tester = OwlViTVisionModelTester(parent, **vision_kwargs) - self.is_training = is_training - self.text_config = self.text_model_tester.get_config().to_dict() - self.vision_config = self.vision_model_tester.get_config().to_dict() - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return OwlViTConfig.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = OwlViTModel(config).eval() - - with no_grad(): - result = model( - input_ids=input_ids, - pixel_values=pixel_values, - attention_mask=attention_mask, - ) - - image_logits_size = ( - self.vision_model_tester.batch_size, - self.text_model_tester.batch_size * self.text_model_tester.num_queries, - ) - text_logits_size = ( - self.text_model_tester.batch_size * self.text_model_tester.num_queries, - self.vision_model_tester.batch_size, - ) - self.parent.assertEqual(result.logits_per_image.shape, image_logits_size) - self.parent.assertEqual(result.logits_per_text.shape, text_logits_size) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "pixel_values": 
pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - "return_loss": False, - } - return config, inputs_dict - - -@require_mindspore -class OwlViTModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (OwlViTModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": OwlViTModel, - "zero-shot-object-detection": OwlViTForObjectDetection, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = OwlViTModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="OwlViTModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - # override as the `logit_scale` parameter initialization is different for OWLVIT - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initialized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save OwlViTConfig and check if we can load OwlViTVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = OwlViTVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save OwlViTConfig and check if we can load OwlViTTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = OwlViTTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class OwlViTForObjectDetectionTester: - def __init__(self, parent, is_training=True): - self.parent = parent - self.text_model_tester = OwlViTTextModelTester(parent) - self.vision_model_tester = OwlViTVisionModelTester(parent) - self.is_training = is_training - self.text_config = self.text_model_tester.get_config().to_dict() - self.vision_config = self.vision_model_tester.get_config().to_dict() - self.batch_size = 
self.text_model_tester.batch_size # need bs for batching_equivalence test - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs() - config = self.get_config() - return config, pixel_values, input_ids, attention_mask - - def get_config(self): - return OwlViTConfig.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64) - - def create_and_check_model(self, config, pixel_values, input_ids, attention_mask): - model = OwlViTForObjectDetection(config).eval() - with no_grad(): - result = model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - return_dict=True, - ) - - pred_boxes_size = ( - self.vision_model_tester.batch_size, - (self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2, - 4, - ) - pred_logits_size = ( - self.vision_model_tester.batch_size, - (self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2, - 4, - ) - pred_class_embeds_size = ( - self.vision_model_tester.batch_size, - (self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2, - self.text_model_tester.hidden_size, - ) - self.parent.assertEqual(result.pred_boxes.shape, pred_boxes_size) - self.parent.assertEqual(result.logits.shape, pred_logits_size) - self.parent.assertEqual(result.class_embeds.shape, pred_class_embeds_size) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, input_ids, attention_mask = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@unittest.skipIf(mindspore.get_context('device_target') == 'CPU', 'CPU causes some errors') -@require_mindspore -class OwlViTForObjectDetectionTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (OwlViTForObjectDetection,) if is_mindspore_available() else () - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - - def setUp(self): - self.model_tester = OwlViTForObjectDetectionTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="OwlViTModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="Test_initialization is tested in individual model tests") - def test_initialization(self): - pass - - @unittest.skip(reason="Test_forward_signature is tested in individual model tests") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Test_save_load_fast_init_from_base is tested in individual model tests") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="OWL-ViT does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="OWL-ViT does not 
support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTForObjectDetection.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -class OwlViTModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTModel.from_pretrained(model_name) - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=[["a photo of a cat", "a photo of a dog"]], - images=image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_image.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - expected_logits = mindspore.tensor([[3.4613, 0.9403]]) - self.assertTrue(ops.allclose(outputs.logits_per_image, expected_logits, atol=1e-3)) - - @slow - def test_inference_object_detection(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTForObjectDetection.from_pretrained(model_name) - - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - inputs = processor( - text=[["a photo of a cat", "a photo of a dog"]], - images=image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - with no_grad(): - outputs = model(**inputs) - - num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2) - self.assertEqual(outputs.pred_boxes.shape, (1, num_queries, 4)) - - expected_slice_boxes = mindspore.tensor( - [[0.0691, 0.0445, 0.1373], [0.1592, 0.0456, 0.3192], [0.1632, 0.0423, 0.2478]] - ) - self.assertTrue(ops.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) - - @slow - def test_inference_one_shot_object_detection(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTForObjectDetection.from_pretrained(model_name) - - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - query_image = prepare_img() - inputs = processor( - images=image, - query_images=query_image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - with no_grad(): - outputs = model.image_guided_detection(**inputs) - - num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2) - self.assertEqual(outputs.target_pred_boxes.shape, (1, num_queries, 4)) - - expected_slice_boxes = mindspore.tensor( - [[0.0691, 0.0445, 0.1373], [0.1592, 0.0456, 0.3192], [0.1632, 0.0423, 0.2478]] - ) - 
self.assertTrue(ops.allclose(outputs.target_pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4)) - - @slow - def test_inference_one_shot_object_detection_fp16(self): - model_name = "google/owlvit-base-patch32" - model = OwlViTForObjectDetection.from_pretrained(model_name, torch_dtype=mindspore.float16) - - processor = OwlViTProcessor.from_pretrained(model_name) - - image = prepare_img() - query_image = prepare_img() - inputs = processor( - images=image, - query_images=query_image, - max_length=16, - padding="max_length", - return_tensors="ms", - ) - - with no_grad(): - outputs = model.image_guided_detection(**inputs) - - # No need to check the logits, we just check inference runs fine. - num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2) - self.assertEqual(outputs.target_pred_boxes.shape, (1, num_queries, 4)) \ No newline at end of file diff --git a/tests/transformers/models/owlvit/test_processor_owlvit.py b/tests/transformers/models/owlvit/test_processor_owlvit.py deleted file mode 100644 index 57297a654..000000000 --- a/tests/transformers/models/owlvit/test_processor_owlvit.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ -"""Testing suite for the MindSpore OwlViT Processor.""" - -import json -import os -import shutil -import tempfile -import unittest - -import numpy as np -import pytest - -from mindnlp.transformers import CLIPTokenizer, CLIPTokenizerFast -from mindnlp.transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES -from mindnlp.utils.testing_utils import require_vision -from mindnlp.utils import is_vision_available -from mindnlp.configs import IMAGE_PROCESSOR_NAME - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import OwlViTImageProcessor, OwlViTProcessor - - -@require_vision -class OwlViTProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - vocab = ["", "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", - "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - self.vocab_file = os.path.join( - self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join( - self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - image_processor_map = { - "do_resize": True, - "size": 20, - "do_center_crop": True, - "crop_size": 18, - "do_normalize": True, - "image_mean": [0.48145466, 0.4578275, 0.40821073], - "image_std": [0.26862954, 0.26130258, 0.27577711], - } - self.image_processor_file = os.path.join( - self.tmpdirname, IMAGE_PROCESSOR_NAME) - with open(self.image_processor_file, "w", encoding="utf-8") as fp: - json.dump(image_processor_map, fp) - - def get_tokenizer(self, **kwargs): - return CLIPTokenizer.from_pretrained(self.tmpdirname, pad_token="!", **kwargs) - - def get_rust_tokenizer(self, **kwargs): - return CLIPTokenizerFast.from_pretrained(self.tmpdirname, pad_token="!", **kwargs) - - def get_image_processor(self, **kwargs): - return OwlViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint( - 255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) - for x in image_inputs] - - return image_inputs - - def test_save_load_pretrained_default(self): - tokenizer_slow = self.get_tokenizer() - tokenizer_fast = self.get_rust_tokenizer() - image_processor = self.get_image_processor() - - processor_slow = OwlViTProcessor( - tokenizer=tokenizer_slow, image_processor=image_processor) - processor_slow.save_pretrained(self.tmpdirname) - processor_slow = OwlViTProcessor.from_pretrained( - self.tmpdirname, use_fast=False) - - processor_fast = OwlViTProcessor( - tokenizer=tokenizer_fast, image_processor=image_processor) - processor_fast.save_pretrained(self.tmpdirname) - processor_fast = OwlViTProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor_slow.tokenizer.get_vocab(), - tokenizer_slow.get_vocab()) - self.assertEqual(processor_fast.tokenizer.get_vocab(), - tokenizer_fast.get_vocab()) - self.assertEqual(tokenizer_slow.get_vocab(), - tokenizer_fast.get_vocab()) - self.assertIsInstance(processor_slow.tokenizer, CLIPTokenizer) - self.assertIsInstance(processor_fast.tokenizer, CLIPTokenizerFast) - - self.assertEqual(processor_slow.image_processor.to_json_string( - ), image_processor.to_json_string()) - self.assertEqual(processor_fast.image_processor.to_json_string( - ), image_processor.to_json_string()) - self.assertIsInstance( - processor_slow.image_processor, OwlViTImageProcessor) - self.assertIsInstance( - processor_fast.image_processor, OwlViTImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = OwlViTProcessor( - tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer( - bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor( - do_normalize=False) - - processor = OwlViTProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", pad_token="!", do_normalize=False - ) - - self.assertEqual(processor.tokenizer.get_vocab(), - tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, CLIPTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), - image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, OwlViTImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = OwlViTProcessor( - tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_image_proc = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_image_proc.keys(): - self.assertAlmostEqual(input_image_proc[key].sum( - ), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = OwlViTProcessor( - tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str, return_tensors="np") - - encoded_tok = tokenizer(input_str, return_tensors="np") - - for key in encoded_tok.keys(): - self.assertListEqual( - encoded_tok[key][0].tolist(), encoded_processor[key][0].tolist()) - - def test_processor(self): - image_processor = self.get_image_processor() 
- tokenizer = self.get_tokenizer() - - processor = OwlViTProcessor( - tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), [ - "input_ids", "attention_mask", "pixel_values"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_processor_with_text_list(self): - model_name = "google/owlvit-base-patch32" - processor = OwlViTProcessor.from_pretrained(model_name) - - input_text = ["cat", "nasa badge"] - inputs = processor(text=input_text) - - seq_length = 16 - self.assertListEqual(list(inputs.keys()), [ - "input_ids", "attention_mask"]) - self.assertEqual(inputs["input_ids"].shape, (2, seq_length)) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_processor_with_nested_text_list(self): - model_name = "google/owlvit-base-patch32" - processor = OwlViTProcessor.from_pretrained(model_name) - - input_texts = [["cat", "nasa badge"], ["person"]] - inputs = processor(text=input_texts) - - seq_length = 16 - batch_size = len(input_texts) - num_max_text_queries = max(len(texts) for texts in input_texts) - - self.assertListEqual(list(inputs.keys()), [ - "input_ids", "attention_mask"]) - self.assertEqual(inputs["input_ids"].shape, - (batch_size * num_max_text_queries, seq_length)) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_processor_case(self): - model_name = "google/owlvit-base-patch32" - processor = OwlViTProcessor.from_pretrained(model_name) - - input_texts = ["cat", "nasa badge"] - inputs = processor(text=input_texts) - - seq_length = 16 - input_ids = inputs["input_ids"] - predicted_ids = [ - [49406, 2368, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [49406, 6841, 11301, 49407, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ] - - self.assertListEqual(list(inputs.keys()), [ - "input_ids", "attention_mask"]) - self.assertEqual(inputs["input_ids"].shape, (2, seq_length)) - self.assertListEqual(list(input_ids[0]), predicted_ids[0]) - self.assertListEqual(list(input_ids[1]), predicted_ids[1]) - - def test_processor_case2(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = OwlViTProcessor( - tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - query_input = self.prepare_image_inputs() - - inputs = processor(images=image_input, query_images=query_input) - - self.assertListEqual(list(inputs.keys()), [ - "query_pixel_values", "pixel_values"]) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = OwlViTProcessor( - tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) diff --git a/tests/transformers/models/patchtst/__init__.py b/tests/transformers/models/patchtst/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/patchtst/test_modeling_patchtst.py 
b/tests/transformers/models/patchtst/test_modeling_patchtst.py deleted file mode 100644 index fbb5f868b..000000000 --- a/tests/transformers/models/patchtst/test_modeling_patchtst.py +++ /dev/null @@ -1,384 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore PatchTST model.""" - -import inspect -import random -import tempfile -import unittest -from unittest import skip - -import numpy as np -from huggingface_hub import hf_hub_download - -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -TOLERANCE = 1e-4 - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING, - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING, - PatchTSTConfig, - PatchTSTForClassification, - PatchTSTForPrediction, - PatchTSTForPretraining, - PatchTSTForRegression, - PatchTSTModel, - ) - - -@require_mindspore -class PatchTSTModelTester: - def __init__( - self, - parent, - batch_size=13, - prediction_length=7, - context_length=14, - patch_length=5, - patch_stride=5, - num_input_channels=1, - num_time_features=1, - is_training=True, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - distil=False, - seed=42, - num_targets=2, - mask_type="random", - random_mask_ratio=0, - ): - self.parent = parent - self.batch_size = batch_size - self.prediction_length = prediction_length - self.context_length = context_length - self.patch_length = patch_length - self.patch_stride = patch_stride - self.num_input_channels = num_input_channels - self.num_time_features = num_time_features - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.mask_type = mask_type - self.random_mask_ratio = random_mask_ratio - - self.seed = seed - self.num_targets = num_targets - self.distil = distil - self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1 - # define seq_length so that it can pass the test_attention_outputs - self.seq_length = self.num_patches - - def get_config(self): - return PatchTSTConfig( - prediction_length=self.prediction_length, - patch_length=self.patch_length, - patch_stride=self.patch_stride, - 
num_input_channels=self.num_input_channels, - d_model=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - context_length=self.context_length, - activation_function=self.hidden_act, - seed=self.seed, - num_targets=self.num_targets, - mask_type=self.mask_type, - random_mask_ratio=self.random_mask_ratio, - ) - - def prepare_patchtst_inputs_dict(self, config): - _past_length = config.context_length - # bs, num_input_channels, num_patch, patch_len - - # [bs x seq_len x num_input_channels] - past_values = floats_tensor([self.batch_size, _past_length, self.num_input_channels]) - - future_values = floats_tensor([self.batch_size, config.prediction_length, self.num_input_channels]) - - inputs_dict = { - "past_values": past_values, - "future_values": future_values, - } - return inputs_dict - - def prepare_config_and_inputs(self): - config = self.get_config() - inputs_dict = self.prepare_patchtst_inputs_dict(config) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - -@require_mindspore -class PatchTSTModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - PatchTSTForPrediction, - PatchTSTForPretraining, - PatchTSTForClassification, - PatchTSTForRegression, - ) - if is_mindspore_available() - else () - ) - - pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_mindspore_available() else {} - is_encoder_decoder = False - test_pruning = False - test_head_masking = False - test_missing_keys = True - test_torchscript = False - test_inputs_embeds = False - - test_resize_embeddings = True - test_resize_position_embeddings = False - test_mismatched_shapes = True - test_model_parallel = False - has_attentions = True - - def setUp(self): - self.model_tester = PatchTSTModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=PatchTSTConfig, - has_text_modality=False, - prediction_length=self.model_tester.prediction_length, - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - # if PatchTSTForPretraining - if model_class == PatchTSTForPretraining: - inputs_dict.pop("future_values") - # else if classification model: - elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING): - rng = random.Random(self.model_tester.seed) - labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_targets, rng=rng) - inputs_dict["target_values"] = labels - inputs_dict.pop("future_values") - elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING): - rng = random.Random(self.model_tester.seed) - target_values = floats_tensor([self.model_tester.batch_size, self.model_tester.num_targets], rng=rng) - inputs_dict["target_values"] = target_values - inputs_dict.pop("future_values") - return inputs_dict - - def test_save_load_strict(self): - config, _ = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, 
output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - num_patch = self.model_tester.num_patches - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [num_patch, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="we have no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - def test_model_main_input_name(self): - model_signature = inspect.signature(getattr(PatchTSTModel, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - if model_class == PatchTSTForPretraining: - expected_arg_names = [ - "past_values", - "past_observed_mask", - ] - elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values( - MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING - ): - expected_arg_names = ["past_values", "target_values", "past_observed_mask"] - else: - expected_arg_names = [ - "past_values", - "past_observed_mask", - "future_values", - ] - - expected_arg_names.extend( - [ - "output_hidden_states", - "output_attentions", - "return_dict", - ] - ) - - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - @is_flaky() - @skip("ModelTesterMixin class has no attribute 'test_retain_grad_hidden_states_attentions'") - def test_retain_grad_hidden_states_attentions(self): - super().test_retain_grad_hidden_states_attentions() - - @unittest.skip(reason="Model does not have input embeddings") - def test_model_get_set_embeddings(self): - pass - - -def prepare_batch(repo_id="hf-internal-testing/etth1-hourly-batch", file="train-batch.pt"): - file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset") - batch = np.load(file) - return batch - - -# Note: Pretrained model is not yet downloadable. -@require_mindspore -@slow -class PatchTSTModelIntegrationTests(unittest.TestCase): - # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. 
- @unittest.skip('Mindspore cannot load torch .pt file.') - def test_pretrain_head(self): - model = PatchTSTForPretraining.from_pretrained("namctin/patchtst_etth1_pretrain") - batch = prepare_batch() - - mindspore.set_seed(0) - output = model(past_values=batch["past_values"]).prediction_output - num_patch = ( - max(model.config.context_length, model.config.patch_length) - model.config.patch_length - ) // model.config.patch_stride + 1 - expected_shape = (64, model.config.num_input_channels, num_patch, model.config.patch_length) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[[-0.0173]], [[-1.0379]], [[-0.1030]], [[0.3642]], [[0.1601]], [[-1.3136]], [[0.8780]]] - ) - self.assertTrue(ops.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE)) - - # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable. - @unittest.skip('Mindspore cannot load torch .pt file.') - def test_prediction_head(self): - model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast") - batch = prepare_batch(file="test-batch.pt") - - mindspore.set_seed(0) - output = model( - past_values=batch["past_values"], - future_values=batch["future_values"], - ).prediction_outputs - expected_shape = (64, model.config.prediction_length, model.config.num_input_channels) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.5142, 0.6928, 0.6118, 0.5724, -0.3735, -0.1336, -0.7124]], - ) - self.assertTrue(ops.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE)) - - @unittest.skip('Mindspore cannot load torch .pt file.') - def test_prediction_generation(self): - model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast") - batch = prepare_batch(file="test-batch.pt") - - mindspore.set_seed(0) - outputs = model.generate(past_values=batch["past_values"]) - expected_shape = (64, 1, model.config.prediction_length, model.config.num_input_channels) - - self.assertEqual(outputs.sequences.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.4075, 0.3716, 0.4786, 0.2842, -0.3107, -0.0569, -0.7489]] - ) - mean_prediction = outputs.sequences.mean(dim=1) - self.assertTrue(ops.allclose(mean_prediction[0, -1:], expected_slice, atol=TOLERANCE)) - - @unittest.skip('Mindspore cannot load torch .pt file.') - def test_regression_generation(self): - model = PatchTSTForRegression.from_pretrained("ibm/patchtst-etth1-regression-distribution") - batch = prepare_batch(repo_id="ibm/patchtst-etth1-test-data", file="regression_distribution_batch.pt") - - mindspore.set_seed(0) - model.eval() - outputs = model.generate(past_values=batch["past_values"]) - expected_shape = (64, model.config.num_parallel_samples, model.config.num_targets) - self.assertEqual(outputs.sequences.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.08046409], [-0.06570087], [-0.28218266], [-0.20636195], [-0.11787311]] - ) - mean_prediction = outputs.sequences.mean(dim=1) - self.assertTrue(ops.allclose(mean_prediction[-5:], expected_slice, rtol=TOLERANCE)) diff --git a/tests/transformers/models/pegasus/__init__.py b/tests/transformers/models/pegasus/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/pegasus/test_modeling_pegasus.py b/tests/transformers/models/pegasus/test_modeling_pegasus.py deleted file mode 100644 index e08b72225..000000000 --- a/tests/transformers/models/pegasus/test_modeling_pegasus.py +++ /dev/null @@ 
-1,602 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore PEGASUS model. """ - -import tempfile -import unittest - -import numpy as np -import mindspore.ops -from mindnlp.transformers import PegasusConfig -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow -) -from mindnlp.utils import cached_property, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin -from ..mbart.test_modeling_mbart import AbstractSeq2SeqIntegrationTest - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import AutoModelForSeq2SeqLM, PegasusForConditionalGeneration, PegasusModel - from mindnlp.transformers.models.pegasus.modeling_pegasus import PegasusDecoder, PegasusEncoder, PegasusForCausalLM - - -def prepare_pegasus_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = mindspore.ops.ones((config.encoder_layers, config.encoder_attention_heads)) - if decoder_head_mask is None: - decoder_head_mask = mindspore.ops.ones((config.decoder_layers, config.decoder_attention_heads)) - if cross_attn_head_mask is None: - cross_attn_head_mask = mindspore.ops.ones((config.decoder_layers, config.decoder_attention_heads)) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class PegasusModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - 
self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - # forcing a certain token to be generated, sets all other tokens to -inf - # if however the token to be generated is already at -inf then it can lead token - # `nan` values and thus break generation - self.forced_bos_token_id = None - self.forced_eos_token_id = None - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_pipeline_config(self): - return PegasusConfig( - vocab_size=200, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=200, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def get_config(self): - return PegasusConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - forced_bos_token_id=self.forced_bos_token_id, - forced_eos_token_id=self.forced_eos_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = PegasusModel(config=config).get_decoder() - model.set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = mindspore.ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = mindspore.ops.cat([attention_mask, next_attn_mask.astype(mindspore.bool_)], axis=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - 
"last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = PegasusModel(config=config) - model.set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = PegasusEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = PegasusDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class PegasusModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (PegasusModel, PegasusForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (PegasusForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "conversational": PegasusForConditionalGeneration, - "feature-extraction": PegasusModel, - "summarization": PegasusForConditionalGeneration, - "text-generation": PegasusForCausalLM, - "text2text-generation": PegasusForConditionalGeneration, - "translation": PegasusForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = True - test_resize_position_embeddings = True - test_pruning = False - test_missing_keys = False - - def setUp(self): - self.model_tester = PegasusModelTester(self) - self.config_tester = ConfigTester(self, config_class=PegasusConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - 
self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = PegasusForConditionalGeneration(config) - model.set_train(False) - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (mindspore.ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - long_tensor = tok_lst.copy().astype(mindspore.int64) - return long_tensor - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "google/pegasus-xsum" - src_text = [ - """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""", - """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. 
And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """, - ] - - tgt_text = [ - "California's largest electricity provider has turned off power to hundreds of thousands of customers.", - "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.", - ] - - @cached_property - def model(self): - return AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name) - - @slow - @require_mindspore - def test_pegasus_xsum_summary(self): - assert self.tokenizer.model_max_length == 512 - inputs = self.tokenizer(self.src_text, return_tensors="ms", truncation=True, max_length=512, padding=True) - assert inputs.input_ids.shape == (2, 421) - translated_tokens = self.model.generate(**inputs, num_beams=2) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded - - self.model.half() - translated_tokens_fp16 = self.model.generate(**inputs, max_length=10) - decoded_fp16 = self.tokenizer.batch_decode(translated_tokens_fp16, skip_special_tokens=True) - assert decoded_fp16 == [ - "California's largest electricity provider has begun", - "N-Dubz have revealed they were", - ] - - -class PegasusStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = 
encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = PegasusConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = PegasusDecoder(config=config) - model.set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = mindspore.ops.cat([input_ids, next_tokens], axis=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = PegasusDecoder(config=config) - model.set_train(False) - - # create attention mask - attn_mask = mindspore.ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = 
ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = mindspore.ops.cat([input_ids, next_tokens], axis=-1) - attn_mask = mindspore.ops.cat( - [attn_mask, mindspore.ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - axis=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class PegasusStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (PegasusDecoder, PegasusForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (PegasusForCausalLM,) if is_mindspore_available() else () - test_resize_position_embeddings = True - test_pruning = False - is_encoder_decoder = False - - def setUp( - self, - ): - self.model_tester = PegasusStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=PegasusConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - return diff --git a/tests/transformers/models/pegasus_x/__init__.py b/tests/transformers/models/pegasus_x/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/pegasus_x/test_modeling_pegasus_x.py b/tests/transformers/models/pegasus_x/test_modeling_pegasus_x.py deleted file mode 100644 index 852980a33..000000000 --- a/tests/transformers/models/pegasus_x/test_modeling_pegasus_x.py +++ /dev/null @@ -1,853 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore PEGASUS-X model.""" - -import copy -import math -import tempfile -import unittest - -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - is_mindspore_available, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property -from mindnlp.core import ops - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import PegasusTokenizer, PegasusXConfig, PegasusXForConditionalGeneration, PegasusXModel - from mindnlp.transformers.models.pegasus_x.modeling_pegasus_x import PegasusXDecoder, PegasusXEncoder - - -def prepare_pegasus_x_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - } - -@require_mindspore -class PegasusXModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = PegasusXConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - 
attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - stagger_local_blocks=False, - ) - inputs_dict = prepare_pegasus_x_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = PegasusXModel(config=config).get_decoder().eval() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(mindspore.bool_)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = PegasusXModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = PegasusXEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = PegasusXDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class PegasusXModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (PegasusXModel, PegasusXForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (PegasusXForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - 
"feature-extraction": PegasusXModel, - "summarization": PegasusXForConditionalGeneration, - "text2text-generation": PegasusXForConditionalGeneration, - "translation": PegasusXForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_pruning = False - test_head_masking = False - test_missing_keys = False - - def setUp(self): - self.model_tester = PegasusXModelTester(self) - self.config_tester = ConfigTester(self, config_class=PegasusXConfig) - - @unittest.skip( - "`PegasusXGlobalLocalAttention` returns attentions as dictionary - not compatible with torchscript " - ) - def test_torchscript_output_attentions(self): - pass - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (PegasusXModel, PegasusXForConditionalGeneration): - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - model(**inputs)[0] - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = PegasusXForConditionalGeneration(config).eval() - model.half() - model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length 
= encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0]["local"].shape[-4:]), - [ - self.model_tester.num_attention_heads, - math.ceil(encoder_seq_length / model.config.block_size), - model.config.block_size, - model.config.block_size + model.config.num_global_tokens, - ], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0]["local"].shape[-4:]), - [ - self.model_tester.num_attention_heads, - math.ceil(encoder_seq_length / model.config.block_size), - model.config.block_size, - model.config.block_size + model.config.num_global_tokens, - ], - ) - - def _check_encoder_attention_for_generate(self, attentions, batch_size, config, seq_length): - encoder_expected_shape = ( - batch_size, - config.num_attention_heads, - math.ceil(seq_length / config.block_size), - config.block_size, - config.block_size + config.num_global_tokens, - ) - 
self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [layer_attentions["local"].shape for layer_attentions in attentions], - [encoder_expected_shape] * len(attentions), - ) - - def _check_encoder_hidden_states_for_generate(self, hidden_states, batch_size, config, seq_length): - encoder_expected_shape = (batch_size, self.round_up(seq_length, config.block_size), config.hidden_size) - self.assertIsInstance(hidden_states, tuple) - # Only the last layer will have the hidden states truncated back to token level - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in hidden_states[:-1]], - [encoder_expected_shape] * (len(hidden_states) - 1), - ) - # Only the last layer will have the hidden states truncated back to token level - self.assertEqual( - hidden_states[-1][0].shape, - (batch_size, seq_length, config.hidden_size), - ) - - def test_hidden_states_output(self): - def _check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.round_up(seq_length, config.block_size), self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - _check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - _check_hidden_states_output(inputs_dict, config, model_class) - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = self.has_attentions - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - if config.is_encoder_decoder: - # Seq2Seq models - encoder_hidden_states = outputs.encoder_hidden_states[0] - - decoder_hidden_states = outputs.decoder_hidden_states[0] - - if self.has_attentions: - encoder_attentions = outputs.encoder_attentions[0] - - decoder_attentions = 
outputs.decoder_attentions[0] - - cross_attentions = outputs.cross_attentions[0] - - self.assertIsNotNone(encoder_hidden_states) - self.assertIsNotNone(decoder_hidden_states) - - if self.has_attentions: - self.assertIsNotNone(encoder_attentions["local"]) - self.assertIsNotNone(encoder_attentions["global"]) - self.assertIsNotNone(decoder_attentions) - self.assertIsNotNone(cross_attentions) - else: - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - - if self.has_attentions: - attentions = outputs.attentions[0] - - - self.assertIsNotNone(hidden_states) - - if self.has_attentions: - self.assertIsNotNone(attentions) - - @classmethod - def round_up(cls, n, k): - return math.ceil(n / k) * k - - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if ops.allclose(a, b, atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return mindspore.tensor(tok_lst, dtype=mindspore.int64) - - -TOLERANCE = 1e-4 - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@slow -class PegasusXModelIntegrationTests(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return PegasusTokenizer.from_pretrained("google/pegasus-x-base") - - def test_inference_no_head(self): - model = PegasusXModel.from_pretrained("google/pegasus-x-base") - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) - inputs_dict = prepare_pegasus_x_inputs_dict(model.config, input_ids, decoder_input_ids) - output = model(**inputs_dict)[0] - expected_shape = (1, 11, 768) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = mindspore.tensor( - [[0.0702, -0.1552, 0.1192], [0.0836, -0.1848, 0.1304], [0.0673, -0.1686, 0.1045]] - ) - - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_head(self): - model = PegasusXForConditionalGeneration.from_pretrained("google/pegasus-x-base") - - # change to intended input - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - inputs_dict = prepare_pegasus_x_inputs_dict(model.config, input_ids, decoder_input_ids) - output = model(**inputs_dict)[0] - expected_shape = (1, 11, model.config.vocab_size) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = mindspore.tensor( - [[0.0, 9.5705185, 1.5897303], [0.0, 9.833374, 1.5828674], [0.0, 10.429961, 1.5643371]] - ) - self.assertTrue(ops.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - - # @unittest.skip(reason="Mindspore Topk bug") - def test_seq_to_seq_generation(self): - hf = PegasusXForConditionalGeneration.from_pretrained("google/pegasus-x-base-arxiv") - tok = PegasusTokenizer.from_pretrained("google/pegasus-x-base") - - batch_input = [ - "While large pretrained Transformer models have proven highly capable at tackling 
natural language tasks," - " handling long sequence inputs continues to be a significant challenge. One such task is long input" - " summarization, where inputs are longer than the maximum input context of most pretrained models. Through" - " an extensive set of experiments, we investigate what model architectural changes and pretraining" - " paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that" - " a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance" - " and efficiency, and that an additional pretraining phase on long sequences meaningfully improves" - " downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the" - " PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X" - " achieves strong performance on long input summarization tasks comparable with much larger models while" - " adding few additional parameters and not requiring model parallelism to train." - ] - - # The below article tests that we don't add any hypotheses outside of the top n_beams - dct = tok.batch_encode_plus( - batch_input, - max_length=512, - padding="max_length", - truncation_strategy="only_first", - truncation=True, - return_tensors="ms", - ) - - hypotheses_batch = hf.generate( - input_ids=dct["input_ids"], - attention_mask=dct["attention_mask"], - num_beams=2, - max_length=32, - ) - - EXPECTED = [ - "we investigate the performance of a new pretrained model for long input summarization. the model is a" - " superposition of two well -" - ] - - generated = tok.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - assert generated == EXPECTED - - -class PegasusXStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if 
self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = PegasusXConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = PegasusXDecoder(config=config).eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = PegasusXDecoder(config=config).eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = 
ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class PegasusXStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (PegasusXDecoder,) if is_mindspore_available() else () - all_generative_model_classes = () - test_pruning = False - is_encoder_decoder = False - test_head_masking = False - - def setUp( - self, - ): - self.model_tester = PegasusXStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=PegasusXConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - @unittest.skip(reason="Decoder cannot keep gradients") - def test_retain_grad_hidden_states_attentions(self): - return diff --git a/tests/transformers/models/perceiver/__init__.py b/tests/transformers/models/perceiver/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/perceiver/test_modeling_perceiver.py b/tests/transformers/models/perceiver/test_modeling_perceiver.py deleted file mode 100644 index a34926efb..000000000 --- a/tests/transformers/models/perceiver/test_modeling_perceiver.py +++ /dev/null @@ -1,1004 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
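(Aside, not part of the patch: the Pegasus and Pegasus-X decoder tests removed above all exercise the same cache-consistency pattern — run the decoder once over the full sequence, then run only the newest token with past_key_values, and assert that the two outputs for the last position agree within a tolerance. The snippet below is a hedged, framework-agnostic toy sketch of that pattern using NumPy and a single made-up attention head; it does not use the MindNLP/MindSpore APIs and the names W_q, decode, etc. are illustrative assumptions only.)

import numpy as np

# Toy single-head causal "decoder" used only to illustrate the cache-consistency
# check: feeding the last token plus a key/value cache must reproduce the output
# obtained from a full forward pass over the extended sequence.
rng = np.random.default_rng(0)
D = 8                                                   # hidden size (illustrative)
W_q, W_k, W_v = (rng.standard_normal((D, D)) * 0.1 for _ in range(3))

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def decode(x, past=None):
    """x: (seq, D) embeddings. Returns (outputs, (K, V)); `past` is a (K, V) cache."""
    q, k, v = x @ W_q, x @ W_k, x @ W_v
    if past is not None:
        k = np.concatenate([past[0], k], axis=0)
        v = np.concatenate([past[1], v], axis=0)
    # Causal mask: query i may attend to keys 0..i, shifted by the cache length.
    offset = k.shape[0] - q.shape[0]
    scores = q @ k.T / np.sqrt(D)
    mask = np.tril(np.ones((q.shape[0], k.shape[0])), k=offset)
    scores = np.where(mask > 0, scores, -1e9)
    return softmax(scores) @ v, (k, v)

tokens = rng.standard_normal((5, D))
next_tok = rng.standard_normal((1, D))

# Full forward pass over the extended sequence (no cache).
full_out, _ = decode(np.concatenate([tokens, next_tok], axis=0))

# Incremental pass: run the prefix once, then only the new token with the cache.
_, cache = decode(tokens)
step_out, _ = decode(next_tok, past=cache)

# The last position of the full pass must match the cached single-step output.
assert np.allclose(full_out[-1], step_out[0], atol=1e-6)
print("cache-consistent:", np.allclose(full_out[-1], step_out[0], atol=1e-6))

The tolerance-based comparison mirrors the atol=1e-2/1e-3 checks in the deleted tests: the cached and uncached paths compute the same quantities but in a different order, so they agree only up to floating-point rounding.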
-"""Testing suite for the MindSpore Perceiver model.""" - -import copy -import inspect -import math -import tempfile -import unittest -import warnings -from typing import Dict, List, Tuple - -import numpy as np -from datasets import load_dataset - -from mindnlp.transformers import PerceiverConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - from mindnlp.engine import set_seed - - from mindnlp.transformers import ( - PerceiverForImageClassificationConvProcessing, - PerceiverForImageClassificationFourier, - PerceiverForImageClassificationLearned, - PerceiverForMaskedLM, - PerceiverForMultimodalAutoencoding, - PerceiverForOpticalFlow, - PerceiverForSequenceClassification, - PerceiverModel, - PerceiverTokenizer, - ) - from mindnlp.transformers.models.auto.modeling_auto import ( - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_MASKED_LM_MAPPING_NAMES, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - MODEL_MAPPING_NAMES, - ) - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import PerceiverImageProcessor - - -class PerceiverModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - num_channels=3, - image_size=32, - train_size=[20, 20], - num_frames=5, - audio_samples_per_frame=200, - samples_per_patch=20, - nchunks=20, - num_latents=10, - d_latents=20, - d_model=64, - num_blocks=1, - num_self_attends_per_block=2, - num_self_attention_heads=1, - num_cross_attention_heads=1, - self_attention_widening_factor=4, - cross_attention_widening_factor=4, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_act="gelu", - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - max_position_embeddings=7, - num_labels=3, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.num_channels = num_channels - self.image_size = image_size - self.train_size = train_size - self.num_frames = num_frames - self.audio_samples_per_frame = audio_samples_per_frame - self.samples_per_patch = samples_per_patch - self.nchunks = nchunks - self.num_latents = num_latents - self.d_latents = d_latents - self.d_model = d_model - self.num_blocks = num_blocks - self.num_self_attends_per_block = num_self_attends_per_block - self.num_self_attention_heads = num_self_attention_heads - self.num_cross_attention_heads = num_cross_attention_heads - self.self_attention_widening_factor = self_attention_widening_factor - self.cross_attention_widening_factor = cross_attention_widening_factor - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_act = hidden_act - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - # set subsampling for multimodal model (take first chunk) - image_chunk_size = np.prod((self.num_frames, self.image_size, 
self.image_size)) // self.nchunks - audio_chunk_size = self.num_frames * self.audio_samples_per_frame // self.samples_per_patch // self.nchunks - self.subsampling = { - "image": ops.arange(0, int(image_chunk_size)), - "audio": ops.arange(0, audio_chunk_size), - "label": None, - } - - def prepare_config_and_inputs(self, model_class=None): - config = self.get_config() - - input_mask = None - sequence_labels = None - token_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.num_labels) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - if model_class is None or model_class.__name__ == "PerceiverModel": - inputs = floats_tensor([self.batch_size, self.seq_length, config.d_model], scale=1.0) - return config, inputs, input_mask, sequence_labels, token_labels - elif model_class.__name__ in ["PerceiverForMaskedLM", "PerceiverForSequenceClassification"]: - inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - # input mask is only relevant for text inputs - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - elif model_class.__name__ == "PerceiverForImageClassificationLearned": - inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - elif model_class.__name__ == "PerceiverForImageClassificationFourier": - inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - elif model_class.__name__ == "PerceiverForImageClassificationConvProcessing": - inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - elif model_class.__name__ == "PerceiverForOpticalFlow": - inputs = floats_tensor([self.batch_size, 2, 27, self.train_size[0], self.train_size[1]]) - elif model_class.__name__ == "PerceiverForMultimodalAutoencoding": - images = ops.randn( - (self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size), - ) - audio = ops.randn( - (self.batch_size, self.num_frames * self.audio_samples_per_frame, 1) - ) - inputs = { - "image": images, - "audio": audio, - "label": ops.zeros((self.batch_size, self.num_labels)), - } - else: - raise ValueError(f"Model class {model_class} not supported") - - return config, inputs, input_mask, sequence_labels, token_labels - - def get_config(self): - return PerceiverConfig( - num_latents=self.num_latents, - d_latents=self.d_latents, - d_model=self.d_model, - qk_channels=self.d_latents, - v_channels=self.d_latents, - num_blocks=self.num_blocks, - num_self_attends_per_block=self.num_self_attends_per_block, - num_self_attention_heads=self.num_self_attention_heads, - num_cross_attention_heads=self.num_cross_attention_heads, - self_attention_widening_factor=self.self_attention_widening_factor, - cross_attention_widening_factor=self.cross_attention_widening_factor, - vocab_size=self.vocab_size, - hidden_act=self.hidden_act, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - max_position_embeddings=self.max_position_embeddings, - image_size=self.image_size, - train_size=self.train_size, - num_frames=self.num_frames, - audio_samples_per_frame=self.audio_samples_per_frame, - samples_per_patch=self.samples_per_patch, - num_labels=self.num_labels, - output_num_channels=32, - _label_trainable_num_channels=16, - ) - - def get_pipeline_config(self): - config = self.get_config() - # Byte level vocab - config.vocab_size = 261 - config.max_position_embeddings 
= 40 - return config - - def create_and_check_for_masked_lm(self, config, inputs, input_mask, sequence_labels, token_labels): - model = PerceiverForMaskedLM(config=config) - model.eval() - result = model(inputs, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_sequence_classification(self, config, inputs, input_mask, sequence_labels, token_labels): - model = PerceiverForSequenceClassification(config=config) - model.eval() - result = model(inputs, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_image_classification_learned( - self, config, inputs, input_mask, sequence_labels, token_labels - ): - model = PerceiverForImageClassificationLearned(config=config) - model.eval() - result = model(inputs, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_image_classification_fourier( - self, config, inputs, input_mask, sequence_labels, token_labels - ): - model = PerceiverForImageClassificationFourier(config=config) - model.eval() - result = model(inputs, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_image_classification_conv( - self, config, inputs, input_mask, sequence_labels, token_labels - ): - model = PerceiverForImageClassificationConvProcessing(config=config) - model.eval() - result = model(inputs, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, inputs, input_mask, sequence_labels, token_labels = config_and_inputs - inputs_dict = {"inputs": inputs, "attention_mask": input_mask} - return config, inputs_dict - - def prepare_config_and_inputs_for_model_class(self, model_class): - config_and_inputs = self.prepare_config_and_inputs(model_class) - config, inputs, input_mask, sequence_labels, token_labels = config_and_inputs - inputs_dict = {"inputs": inputs, "attention_mask": input_mask} - - return config, inputs_dict - - -@require_mindspore -class PerceiverModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - PerceiverModel, - PerceiverForMaskedLM, - PerceiverForImageClassificationLearned, - PerceiverForImageClassificationConvProcessing, - PerceiverForImageClassificationFourier, - PerceiverForOpticalFlow, - PerceiverForMultimodalAutoencoding, - PerceiverForSequenceClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": PerceiverModel, - "fill-mask": PerceiverForMaskedLM, - "image-classification": ( - PerceiverForImageClassificationConvProcessing, - PerceiverForImageClassificationFourier, - PerceiverForImageClassificationLearned, - ), - "text-classification": PerceiverForSequenceClassification, - "zero-shot": PerceiverForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_head_masking = False - test_torchscript = False - - maxDiff = None - - def setUp(self): - self.model_tester = PerceiverModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=PerceiverConfig, - hidden_size=37, 
- common_properties=["d_model", "num_self_attention_heads", "num_cross_attention_heads"], - ) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if model_class.__name__ == "PerceiverForMultimodalAutoencoding": - inputs_dict["subsampled_output_points"] = self.model_tester.subsampling - - if return_labels: - if model_class.__name__ in [ - *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values(), - "PerceiverForImageClassificationLearned", - "PerceiverForImageClassificationFourier", - "PerceiverForImageClassificationConvProcessing", - *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(), - ]: - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class.__name__ in [ - *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES.values(), - *MODEL_FOR_MASKED_LM_MAPPING_NAMES.values(), - ]: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class=PerceiverForMaskedLM) - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class=PerceiverForSequenceClassification) - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_image_classification_learned(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class=PerceiverForImageClassificationLearned - ) - self.model_tester.create_and_check_for_image_classification_learned(*config_and_inputs) - - def test_for_image_classification_fourier(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class=PerceiverForImageClassificationFourier - ) - self.model_tester.create_and_check_for_image_classification_fourier(*config_and_inputs) - - def test_for_image_classification_conv(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs( - model_class=PerceiverForImageClassificationConvProcessing - ) - self.model_tester.create_and_check_for_image_classification_conv(*config_and_inputs) - - def test_model_get_set_embeddings(self): - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - model = model_class(config) - # we overwrite this, as the embeddings of Perceiver are an instance of nn.Parameter - # and Perceiver doesn't support get_output_embeddings - self.assertIsInstance(model.get_input_embeddings(), (nn.Parameter)) - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes: - if model_class.__name__ in [ - *MODEL_MAPPING_NAMES.values(), - "PerceiverForOpticalFlow", - "PerceiverForMultimodalAutoencoding", - ]: - continue - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - config.return_dict = True - - model = model_class(config) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() - - def test_forward_signature(self): - for model_class in self.all_model_classes: - config, _ = 
self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["inputs"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_determinism(self): - set_seed(123) - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - - model = model_class(config) - model.eval() - with no_grad(): - inputs_dict = self._prepare_for_class(inputs_dict, model_class) - first = model(**inputs_dict)[0] - second = model(**inputs_dict)[0] - - if model_class.__name__ == "PerceiverForMultimodalAutoencoding": - # model outputs a dictionary with logits per modality, let's verify each modality - for modality in first.keys(): - out_1 = first[modality].asnumpy() - out_2 = second[modality].asnumpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-3) - else: - out_1 = first.asnumpy() - out_2 = second.asnumpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_attention_outputs(self): - seq_len = getattr(self.model_tester, "num_latents", None) - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - config.return_dict = True - - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - self_attentions = outputs.attentions - cross_attentions = outputs.cross_attentions - - # check expected number of attentions depending on model class - expected_num_self_attentions = self.model_tester.num_blocks * self.model_tester.num_self_attends_per_block - if model.__class__.__name__ == "PerceiverModel": - # we expect to have 2 cross-attentions, namely one in the PerceiverEncoder, and one in PerceiverBasicDecoder - expected_num_cross_attentions = 1 - else: - # we expect to have 2 cross-attentions, namely one in the PerceiverEncoder, and one in PerceiverBasicDecoder - expected_num_cross_attentions = 2 - self.assertEqual(len(self_attentions), expected_num_self_attentions) - self.assertEqual(len(cross_attentions), expected_num_cross_attentions) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - self_attentions = outputs.attentions - cross_attentions = outputs.cross_attentions - self.assertEqual(len(self_attentions), expected_num_self_attentions) - self.assertEqual(len(cross_attentions), expected_num_cross_attentions) - - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_self_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = 
model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), expected_num_self_attentions) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_self_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = self.model_tester.num_blocks * self.model_tester.num_self_attends_per_block + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.num_latents - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.d_latents], - ) - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip('CPU cannot reach 1e-3 precision') - def test_batching_equivalence(self): - set_seed(123) - super().test_batching_equivalence() - - def test_model_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - set_seed(123) - with no_grad(): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - ops.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-2 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." 
- ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - print(model_class) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - - model = model_class(config) - model.eval() - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: - # optical flow + multimodal models don't support training for now - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: - # optical flow + multimodal models don't support training for now - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: - # optical flow + multimodal models don't support training for now - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - if model_class.__name__ not in ["PerceiverForOpticalFlow", "PerceiverForMultimodalAutoencoding"]: - # optical flow + multimodal models don't support training for now - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) - - def test_feed_forward_chunking(self): - for model_class in self.all_model_classes: - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - set_seed(0) - config = copy.deepcopy(original_config) - model = model_class(config) - model.eval() - - hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - set_seed(0) - config.chunk_size_feed_forward = 1 - model = model_class(config) - model.eval() - - hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] - if model_class.__name__ == "PerceiverForMultimodalAutoencoding": - # model outputs a dictionary with logits for each modality - for modality in hidden_states_no_chunk.keys(): - self.assertTrue( - ops.allclose(hidden_states_no_chunk[modality], hidden_states_with_chunk[modality], atol=1e-3) - ) - else: - self.assertTrue(ops.allclose(hidden_states_no_chunk, 
hidden_states_with_chunk, atol=1e-3)) - - def test_save_load(self): - for model_class in self.all_model_classes: - set_seed(123) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if model_class.__name__ == "PerceiverForMultimodalAutoencoding": - for modality in outputs[0].keys(): - out_2 = outputs[0][modality].asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - with no_grad(): - after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # Make sure we don't have nans - out_1 = after_outputs[0][modality].asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-3) - - else: - out_2 = outputs[0].asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - with no_grad(): - after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # Make sure we don't have nans - out_1 = after_outputs[0].asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_correct_missing_keys(self): - if not self.test_missing_keys: - self.skipTest(reason="test_missing_keys is set to False") - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - # most Perceiver models don't have a typical head like is the case with BERT - if model_class.__name__ in [ - "PerceiverForOpticalFlow", - "PerceiverForMultimodalAutoencoding", - *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values(), - "PerceiverForImageClassificationLearned", - "PerceiverForImageClassificationFourier", - "PerceiverForImageClassificationConvProcessing", - *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.values(), - ]: - continue - - model = model_class(config) - base_model_prefix = model.base_model_prefix - - if hasattr(model, base_model_prefix): - with tempfile.TemporaryDirectory() as temp_dir_name: - model.base_model.save_pretrained(temp_dir_name) - model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) - with self.subTest(msg=f"Missing keys for {model.__class__.__name__}"): - self.assertGreater(len(loading_info["missing_keys"]), 0) - - def test_problem_types(self): - problem_types = [ - {"title": "multi_label_classification", "num_labels": 2, "dtype": mindspore.float32}, - {"title": "single_label_classification", "num_labels": 1, "dtype": mindspore.int64}, - {"title": "regression", "num_labels": 1, "dtype": mindspore.float32}, - ] - - for model_class in self.all_model_classes: - if model_class.__name__ not in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES.values(): - continue - - config, inputs, input_mask, _, _ = self.model_tester.prepare_config_and_inputs(model_class=model_class) - inputs_dict = {"inputs": inputs, "attention_mask": input_mask} - - for problem_type in problem_types: - with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] - config.num_labels = problem_type["num_labels"] - - model = model_class(config) - model.train() - - inputs = self._prepare_for_class(inputs_dict, 
model_class, return_labels=True) - - if problem_type["num_labels"] > 1: - inputs["labels"] = inputs["labels"].unsqueeze(1).tile((1, problem_type["num_labels"])) - - inputs["labels"] = inputs["labels"].to(problem_type["dtype"]) - - # This tests that we do not trigger the warning form PyTorch "Using a target size that is different - # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure - # they have the same size." which is a symptom something in wrong for the regression problem. - # See https://github.com/huggingface/transformers/issues/11780 - with warnings.catch_warnings(record=True) as warning_list: - loss = model(**inputs).loss - for w in warning_list: - if "Using a target size that is different to the input size" in str(w.message): - raise ValueError( - f"Something is going wrong in the regression problem: intercepted {w.message}" - ) - - loss.backward() - - @unittest.skip( - reason=( - "Perceiver does not work with data parallel (DP) because of a bug in PyTorch:" - " https://github.com/pytorch/pytorch/issues/36035" - ) - ) - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="Perceiver doesn't support resize_token_embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Perceiver doesn't support resize_token_embeddings") - def test_resize_embeddings_untied(self): - pass - - @unittest.skip(reason="Perceiver doesn't support inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Perceiver doesn't support the AutoModel API") - def test_load_with_mismatched_shapes(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "deepmind/language-perceiver" - model = PerceiverModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -# Helper functions for optical flow integration test -def prepare_optical_flow_images(): - dataset = load_dataset("hf-internal-testing/fixtures_sintel", split="test", trust_remote_code=True) - image1 = Image.open(dataset[0]["file"]).convert("RGB") - image2 = Image.open(dataset[0]["file"]).convert("RGB") - - return image1, image2 - - -def normalize(img): - return img / 255.0 * 2 - 1 - - -def extract_image_patches(x, kernel, stride=1, dilation=1): - # Do TF 'SAME' Padding - b, c, h, w = x.shape - h2 = math.ceil(h / stride) - w2 = math.ceil(w / stride) - pad_row = (h2 - 1) * stride + (kernel - 1) * dilation + 1 - h - pad_col = (w2 - 1) * stride + (kernel - 1) * dilation + 1 - w - x = nn.functional.pad(x, (pad_row // 2, pad_row - pad_row // 2, pad_col // 2, pad_col - pad_col // 2)) - - # Extract patches - patches = x.unfold(2, kernel, stride).unfold(3, kernel, stride) - patches = patches.permute(0, 4, 5, 1, 2, 3) - - return patches.view(b, -1, patches.shape[-2], patches.shape[-1]) - - -@require_mindspore -@require_vision -class PerceiverModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_masked_lm(self): - tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") - model = 
PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver") - - # prepare inputs - text = "This is an incomplete sentence where some words are missing." - encoding = tokenizer(text, padding="max_length", return_tensors="ms") - - # mask " missing.". - encoding.input_ids[0, 52:61] = tokenizer.mask_token_id - inputs, input_mask = encoding.input_ids, encoding.attention_mask - - # forward pass - with no_grad(): - outputs = model(inputs=inputs, attention_mask=input_mask) - logits = outputs.logits - - # verify logits - expected_shape = (1, tokenizer.model_max_length, len(tokenizer)) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-10.8609, -10.7651, -10.9187], [-12.1689, -11.9389, -12.1479], [-12.1518, -11.9707, -12.2073]], - ) - - self.assertTrue(ops.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)) - - expected_greedy_predictions = [38, 115, 111, 121, 121, 111, 116, 109, 52] - masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist() - self.assertListEqual(expected_greedy_predictions, masked_tokens_predictions) - - @slow - def test_inference_image_classification(self): - image_processor = PerceiverImageProcessor() - model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned") - - # prepare inputs - image = prepare_img() - inputs = image_processor(image, return_tensors="ms").pixel_values - input_mask = None - - # forward pass - with no_grad(): - outputs = model(inputs=inputs, attention_mask=input_mask) - logits = outputs.logits - - # verify logits - expected_shape = (1, model.config.num_labels) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-1.1652, -0.1992, -0.7520]) - - atol = 1e-3 - self.assertTrue(ops.allclose(logits[0, :3], expected_slice, atol=atol)) - - @slow - def test_inference_image_classification_fourier(self): - image_processor = PerceiverImageProcessor() - model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier") - - # prepare inputs - image = prepare_img() - inputs = image_processor(image, return_tensors="ms").pixel_values - input_mask = None - - # forward pass - with no_grad(): - outputs = model(inputs=inputs, attention_mask=input_mask) - logits = outputs.logits - - # verify logits - expected_shape = (1, model.config.num_labels) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-1.1295, -0.2832, 0.3226]) - - self.assertTrue(ops.allclose(logits[0, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_image_classification_conv(self): - image_processor = PerceiverImageProcessor() - model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv") - - # prepare inputs - image = prepare_img() - inputs = image_processor(image, return_tensors="ms").pixel_values - input_mask = None - - # forward pass - with no_grad(): - outputs = model(inputs=inputs, attention_mask=input_mask) - logits = outputs.logits - - # verify logits - expected_shape = (1, model.config.num_labels) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-1.1186, 0.0554, 0.0897]) - - self.assertTrue(ops.allclose(logits[0, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_optical_flow(self): - model = PerceiverForOpticalFlow.from_pretrained("deepmind/optical-flow-perceiver") - - # prepare inputs - image1, image2 = prepare_optical_flow_images() - img1 = normalize(np.array(image1)) - img2 = 
normalize(np.array(image1)) - - # stack images - img1 = mindspore.tensor(np.moveaxis(img1, -1, 0)) - img2 = mindspore.tensor(np.moveaxis(img2, -1, 0)) - images = ops.stack([img1, img2], dim=0) - - # extract 3x3 patches - patch_size = model.config.train_size - - inputs = images[..., : patch_size[0], : patch_size[1]].unsqueeze(0) - batch_size, _, C, H, W = inputs.shape - patches = extract_image_patches(inputs.view(batch_size * 2, C, H, W), kernel=3) - _, C, H, W = patches.shape - patches = patches.view(batch_size, -1, C, H, W).float() - - # forward pass - with no_grad(): - outputs = model(inputs=patches) - logits = outputs.logits - - # verify logits - expected_shape = (1, 368, 496, 2) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [[0.0025, -0.0050], [0.0025, -0.0049], [0.0025, -0.0048]], - [[0.0026, -0.0049], [0.0026, -0.0048], [0.0026, -0.0047]], - [[0.0026, -0.0049], [0.0026, -0.0048], [0.0026, -0.0046]], - ], - ) - - self.assertTrue(ops.allclose(logits[0, :3, :3, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_interpolate_pos_encoding(self): - image_processor = PerceiverImageProcessor(size={"height": 384, "width": 384}) - model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned") - - # prepare inputs - image = prepare_img() - inputs = image_processor(image, return_tensors="ms").pixel_values - input_mask = None - - # forward pass - with no_grad(): - outputs = model(inputs=inputs, attention_mask=input_mask, interpolate_pos_encoding=True) - logits = outputs.logits - - # verify logits - expected_shape = (1, model.config.num_labels) - self.assertEqual(logits.shape, expected_shape) \ No newline at end of file diff --git a/tests/transformers/models/perceiver/test_tokenization_perceiver.py b/tests/transformers/models/perceiver/test_tokenization_perceiver.py deleted file mode 100644 index b25a27fb9..000000000 --- a/tests/transformers/models/perceiver/test_tokenization_perceiver.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
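For reference, a minimal sketch of the TF-'SAME' padding arithmetic that the extract_image_patches helper above relies on; the 368x496 crop size comes from the optical-flow test, while the same_padding helper name is purely illustrative:

import math

def same_padding(size, kernel=3, stride=1, dilation=1):
    # TF 'SAME' targets an output length of ceil(size / stride); the total padding
    # is whatever makes that many dilated kernel windows fit over the input.
    out = math.ceil(size / stride)
    return max((out - 1) * stride + (kernel - 1) * dilation + 1 - size, 0)

# For the 368x496 optical-flow crop with a 3x3 kernel and stride 1, two extra rows
# and columns are needed in total, split as (pad // 2, pad - pad // 2) = (1, 1).
assert same_padding(368) == 2 and same_padding(496) == 2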
- -import json -import os -import re -import shutil -import tempfile -import unittest -from typing import Tuple - -from mindnlp.transformers import PerceiverTokenizer -from mindnlp.transformers.tokenization_utils import AddedToken, BatchEncoding -from mindnlp.utils import cached_property, is_mindspore_available - -from ...test_tokenization_common import TokenizerTesterMixin - -if is_mindspore_available(): - FRAMEWORK = "ms" -else: - FRAMEWORK = "jax" - - -class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "deepmind/language-perceiver" - tokenizer_class = PerceiverTokenizer - test_rust_tokenizer = False - - def setUp(self): - super().setUp() - tokenizer = PerceiverTokenizer() - tokenizer.save_pretrained(self.tmpdirname) - - @cached_property - def perceiver_tokenizer(self): - return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") - - def get_tokenizer(self, **kwargs) -> PerceiverTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: - # XXX The default common tokenizer tests assume that every ID is decodable on its own. - # This assumption is invalid for Perceiver because single bytes might not be - # valid utf-8 (byte 128 for instance). - # Here we're overriding the smallest possible method to provide - # a clean sequence without making the same assumption. - - toks = [] - for i in range(len(tokenizer)): - try: - tok = tokenizer.decode([i], clean_up_tokenization_spaces=False) - except UnicodeDecodeError: - pass - toks.append((i, tok)) - - toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) - toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) - if max_length is not None and len(toks) > max_length: - toks = toks[:max_length] - if min_length is not None and min_length > len(toks) > 0: - while len(toks) < min_length: - toks = toks + toks - # toks_str = [t[1] for t in toks] - toks_ids = [t[0] for t in toks] - - # Ensure consistency - output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) - if " " not in output_txt and len(toks_ids) > 1: - output_txt = ( - tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) - + " " - + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) - ) - if with_prefix_space: - output_txt = " " + output_txt - output_ids = tokenizer.encode(output_txt, add_special_tokens=False) - return output_txt, output_ids - - def test_multibytes_char(self): - tokenizer = self.perceiver_tokenizer - src_text = "Unicode €." 
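As an aside (illustrative only, not part of the deleted file): the reason get_clean_sequence above cannot assume every ID decodes on its own is that Perceiver IDs are raw bytes, and a lone continuation byte such as 0x80 (the "byte 128" mentioned in the comment) is not valid UTF-8, whereas a character like "€" only decodes as its complete three-byte sequence:

# standalone check of the byte-level decoding behaviour described above
try:
    bytes([0x80]).decode("utf-8")
except UnicodeDecodeError as err:
    print("byte 128 alone does not decode:", err)

print(bytes([0xE2, 0x82, 0xAC]).decode("utf-8"))  # the three UTF-8 bytes of '€'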
- encoded = tokenizer(src_text) - encoded_ids = [4, 91, 116, 111, 105, 117, 106, 107, 38, 232, 136, 178, 52, 5] - self.assertEqual(encoded["input_ids"], encoded_ids) - - # decoding - decoded = tokenizer.decode(encoded_ids) - self.assertEqual(decoded, "[CLS]Unicode €.[SEP]") - - encoded = tokenizer("e è é ê ë") - encoded_ids = [4, 107, 38, 201, 174, 38, 201, 175, 38, 201, 176, 38, 201, 177, 5] - self.assertEqual(encoded["input_ids"], encoded_ids) - # decoding - decoded = tokenizer.decode(encoded_ids) - self.assertEqual(decoded, "[CLS]e è é ê ë[SEP]") - - # encode/decode, but with `encode` instead of `__call__` - self.assertEqual(tokenizer.decode(tokenizer.encode("e è é ê ë")), "[CLS]e è é ê ë[SEP]") - - def test_prepare_batch_integration(self): - tokenizer = self.perceiver_tokenizer - src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] - expected_src_tokens = [4, 71, 38, 114, 117, 116, 109, 38, 118, 103, 120, 103, 109, 120, 103, 118, 110, 38, 108, - 117, 120, 38, 121, 123, 115, 115, 103, 120, 111, 128, 103, 122, 111, 117, 116, 52, 5, - 0] # fmt: skip - batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) - self.assertIsInstance(batch, BatchEncoding) - - if FRAMEWORK != "jax": - result = list(batch.input_ids.numpy()[0]) - else: - result = list(batch.input_ids.tolist()[0]) - - self.assertListEqual(expected_src_tokens, result) - - self.assertEqual((2, 38), batch.input_ids.shape) - self.assertEqual((2, 38), batch.attention_mask.shape) - - def test_empty_target_text(self): - tokenizer = self.perceiver_tokenizer - src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."] - batch = tokenizer(src_text, padding=True, return_tensors=FRAMEWORK) - # check if input_ids are returned and no decoder_input_ids - self.assertIn("input_ids", batch) - self.assertIn("attention_mask", batch) - self.assertNotIn("decoder_input_ids", batch) - self.assertNotIn("decoder_attention_mask", batch) - - def test_max_length_integration(self): - tokenizer = self.perceiver_tokenizer - tgt_text = [ - "Summary of the text.", - "Another summary.", - ] - targets = tokenizer( - text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK - ) - self.assertEqual(32, targets["input_ids"].shape[1]) - - # cannot use default save_and_load_tokenizer test method because tokenizer has no vocab - def test_save_and_load_tokenizer(self): - # safety check on max_len default value so we are sure the test works - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertNotEqual(tokenizer.model_max_length, 42) - - # Now let's start the test - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00e9d,running" - before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) - self.assertListEqual(before_tokens, after_tokens) - - shutil.rmtree(tmpdirname) - - tokenizers = self.get_tokenizers(model_max_length=42) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other 
tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00e9d,running" - tokenizer.add_tokens(["bim", "bambam"]) - additional_special_tokens = tokenizer.additional_special_tokens - additional_special_tokens.append("new_additional_special_token") - tokenizer.add_special_tokens( - {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False - ) - before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) - self.assertListEqual(before_tokens, after_tokens) - self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) - self.assertEqual(after_tokenizer.model_max_length, 42) - - tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) - self.assertEqual(tokenizer.model_max_length, 43) - - shutil.rmtree(tmpdirname) - - # There is a conflict between the default value of extra_ids and adding a new - # special token through additional_special_tokens - # We need to add the extra_ids in the list of the arg additional_special_tokens - def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): - tokenizer_list = [] - if self.test_slow_tokenizer: - tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) - - if self.test_rust_tokenizer: - tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) - - for tokenizer_class, tokenizer_utils in tokenizer_list: - with tempfile.TemporaryDirectory() as tmp_dir: - tokenizer_utils.save_pretrained(tmp_dir) - - with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file: - special_tokens_map = json.load(json_file) - - with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file: - tokenizer_config = json.load(json_file) - - added_tokens_extra_ids = [f"<extra_id_{i}>" for i in range(125)] - - special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [ - "an_additional_special_token" - ] - tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [ - "an_additional_special_token" - ] - - with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile: - json.dump(special_tokens_map, outfile) - with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile: - json.dump(tokenizer_config, outfile) - - # the following checks allow us to verify that our test works as expected, i.e. 
that the tokenizer takes - # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and - # "special_tokens_map.json" files - tokenizer_without_change_in_init = tokenizer_class.from_pretrained( - tmp_dir, - ) - self.assertIn( - "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens - ) - self.assertEqual( - ["an_additional_special_token"], - tokenizer_without_change_in_init.convert_ids_to_tokens( - tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) - ), - ) - - # Now we test that we can change the value of additional_special_tokens in the from_pretrained - new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)] - tokenizer = tokenizer_class.from_pretrained( - tmp_dir, - additional_special_tokens=new_added_tokens, - ) - - self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) - self.assertEqual( - ["a_new_additional_special_token"], - tokenizer.convert_ids_to_tokens( - tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) - ), - ) - - def test_decode_invalid_byte_id(self): - tokenizer = self.perceiver_tokenizer - self.assertEqual(tokenizer.decode([178]), "�") - - @unittest.skip(reason="tokenizer does not have vocabulary") - def test_get_vocab(self): - pass - - @unittest.skip(reason="inputs cannot be pretokenized") - def test_pretokenized_inputs(self): - # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters - pass - - @unittest.skip(reason="vocab does not exist") - def test_conversion_reversible(self): - pass - - @unittest.skip(reason="no pretrained tokenizer for Perceiver model") - def test_pretrained_model_lists(self): - pass - - def test_convert_tokens_to_string_format(self): - # The default common tokenizer tests uses invalid tokens for Perceiver that can only accept one-character - # strings and special added tokens as tokens - tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokens = ["[CLS]", "t", "h", "i", "s", " ", "i", "s", " ", "a", " ", "t", "e", "s", "t", "[SEP]"] - string = tokenizer.convert_tokens_to_string(tokens) - - self.assertIsInstance(string, str) diff --git a/tests/transformers/models/persimmon/__init__.py b/tests/transformers/models/persimmon/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/persimmon/test_modeling_persimmon.py b/tests/transformers/models/persimmon/test_modeling_persimmon.py deleted file mode 100644 index 8eabdecfd..000000000 --- a/tests/transformers/models/persimmon/test_modeling_persimmon.py +++ /dev/null @@ -1,520 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Persimmon model.""" - -import gc -import unittest - -from parameterized import parameterized - -from mindnlp.transformers import PersimmonConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.engine import set_seed -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - AutoTokenizer, - PersimmonForCausalLM, - PersimmonForSequenceClassification, - PersimmonForTokenClassification, - PersimmonModel, - ) - from mindnlp.transformers.models.persimmon.modeling_persimmon import ( - PersimmonDynamicNTKScalingRotaryEmbedding, - PersimmonLinearScalingRotaryEmbedding, - PersimmonRotaryEmbedding, - ) - - -# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester with Llama->Persimmon -class PersimmonModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return PersimmonConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - 
num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = PersimmonModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = PersimmonModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = PersimmonForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = PersimmonForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - 
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class PersimmonModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (PersimmonModel, PersimmonForCausalLM, PersimmonForSequenceClassification, PersimmonForTokenClassification) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": PersimmonModel, - "text-classification": PersimmonForSequenceClassification, - "token-classification": PersimmonForTokenClassification, - # TODO (ydshieh): check why these two fail. Fix them or skip them in a better way. - # "text-generation": PersimmonForCausalLM, - # "zero-shot": PersimmonForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - all_generative_model_classes = (PersimmonForCausalLM,) if is_mindspore_available() else () - test_headmasking = False - test_pruning = False - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Persimmon - def setUp(self): - self.model_tester = PersimmonModelTester(self) - self.config_tester = ConfigTester(self, config_class=PersimmonConfig, hidden_size=37) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_config - def test_config(self): - self.config_tester.run_common_tests() - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_various_embeddings - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model with Llama->Persimmon,llama->persimmon - def test_persimmon_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = PersimmonForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from 
tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_single_label with Llama->Persimmon,llama->persimmon - def test_persimmon_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = PersimmonForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_multi_label with Llama->Persimmon,llama->persimmon - def test_persimmon_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = PersimmonForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Persimmon,llama->persimmon - def test_persimmon_token_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = PersimmonForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=token_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), - ) - - @unittest.skip(reason="Persimmon buffers include complex numbers, which breaks this test") - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_save_load_fast_init_from_base - def test_save_load_fast_init_from_base(self): - pass - - @parameterized.expand([("linear",), ("dynamic",)]) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->Persimmon - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = PersimmonModel(config) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two 
models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = PersimmonModel(config) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - self.assertTrue(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - else: - self.assertFalse(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(ops.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - # Copied from tests.models.falcon.test_modeling_falcon.FalconModelTest.test_model_rope_scaling with Falcon->Persimmon - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = ops.randn(1, dtype=mindspore.float32) # used exlusively to get the dtype and the device - - # Sanity check original RoPE - original_rope = PersimmonRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ) - original_cos_short, original_sin_short = original_rope(x, short_input_length) - original_cos_long, original_sin_long = original_rope(x, long_input_length) - assert ops.allclose(original_cos_short, original_cos_long[:short_input_length, :]) - assert ops.allclose(original_sin_short, original_sin_long[:short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - linear_scaling_rope = PersimmonLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, short_input_length) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, long_input_length) - assert ops.allclose(linear_cos_short, linear_cos_long[:short_input_length, :]) - assert ops.allclose(linear_sin_short, linear_sin_long[:short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - assert ops.allclose(linear_cos_long[new_position, :], original_cos_long[original_position, :]) - assert ops.allclose(linear_sin_long[new_position, :], original_sin_long[original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = PersimmonDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, short_input_length) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, long_input_length) - assert ops.allclose(ntk_cos_short, original_cos_short) - assert ops.allclose(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - assert ops.allclose(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - assert ops.allclose(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - -# @require_mindspore -# class PersimmonIntegrationTest(unittest.TestCase): - # @slow - # @require_mindspore_accelerator - # @require_bitsandbytes - # def test_model_8b_chat_logits(self): - # input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - # model = PersimmonForCausalLM.from_pretrained( - # "adept/persimmon-8b-chat", load_in_8bit=True, device_map={"": 0}, torch_dtype=mindspore.float16 - # ) - # out = model(mindspore.tensor([input_ids])).logits - - # EXPECTED_MEAN = mindspore.tensor( - # [[-11.4726, -11.1495, -11.2694, -11.2223, -10.9452, -11.0663, -11.0031, -11.1028]] - # ) - # # change dtype to `mindspore.float32` before calling `mean` to avoid `nan` values - # assert ops.allclose(out.to(mindspore.float32).mean(-1), EXPECTED_MEAN, atol=1e-4, rtol=1e-4) - # # fmt: off - # EXPECTED_SLICE = mindspore.tensor( - # [-16.9062, -16.9062, -16.9062, -16.9062, -16.8906, -16.9062, -16.9531, -16.9062, -16.9062, -16.9062, -16.9531, -16.9062, -16.9531, -16.9062, -16.9062, -16.9062, -16.9062, -16.9062, -16.9531, -16.9062, -16.9062, -16.9062, -16.9062, -16.9062, -16.9062, -16.9531, -16.9062, -16.9531, -16.9062, -16.9062], - # dtype=mindspore.float16 - # ) - # # fmt: on - # assert ops.allclose(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5) - - # del model - # gc.collect() - - # @slow - # @require_mindspore_accelerator - # @require_mindspore_fp16 - # @require_bitsandbytes - # def test_model_8b_chat_greedy_generation(self): - # EXPECTED_TEXT_COMPLETION = """human: Simply put, the theory of relativity states that?\n\nadept: The theory of relativity states that the laws of physics are the same for all observers, regardless of their relative motion.""" - # prompt = "human: Simply put, the theory of relativity states that?\n\nadept:" - # tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-chat", use_fast=False) - # input_ids = tokenizer.encode(prompt, return_tensors="ms") - # model = PersimmonForCausalLM.from_pretrained( - # "adept/persimmon-8b-chat", load_in_8bit=True, device_map={"": 0}, torch_dtype=mindspore.float16 - # ) - - # # greedy generation outputs - # generated_ids = model.generate(input_ids, max_new_tokens=64) - # text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - # self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - # del model - # gc.collect() \ No newline at end of file diff --git a/tests/transformers/models/phi/__init__.py b/tests/transformers/models/phi/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/phi/test_modeling_phi.py b/tests/transformers/models/phi/test_modeling_phi.py deleted file mode 100644 index cc3cb0f3d..000000000 --- 
a/tests/transformers/models/phi/test_modeling_phi.py +++ /dev/null @@ -1,451 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Testing suite for the PyTorch Phi model. """ - - -import unittest -import numpy as np - -from mindnlp.transformers import PhiConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - AutoTokenizer, - PhiForCausalLM, - PhiForSequenceClassification, - PhiForTokenClassification, - PhiModel, - ) - - -class PhiModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = 
ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return PhiConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = PhiModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = PhiModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = PhiForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = PhiForCausalLM(config=config) - - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - 
)["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class PhiModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (PhiModel, PhiForCausalLM, PhiForSequenceClassification, PhiForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (PhiForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": PhiModel, - "text-classification": PhiForSequenceClassification, - "text-generation": PhiForCausalLM, - "token-classification": PhiForTokenClassification, - "zero-shot": PhiForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Phi - def setUp(self): - self.model_tester = PhiModelTester(self) - self.config_tester = ConfigTester(self, config_class=PhiConfig, hidden_size=37) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_config - def test_config(self): - self.config_tester.run_common_tests() - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model with Llama->Phi,llama->phi - def test_phi_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = PhiForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_single_label with Llama->Phi,llama->phi - def test_phi_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = PhiForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_multi_label with Llama->Phi,llama->phi - def test_phi_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = PhiForSequenceClassification(config) - - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @slow - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_flash_attn_2_generate_padding_right with 
LlamaForCausalLM->PhiForCausalLM,LlamaTokenizer->AutoTokenizer,meta-llama/Llama-2-7b-hf->microsoft/phi-1 - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1" - ) - - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="ms", padding=True) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - -@slow -@require_mindspore -class PhiIntegrationTest(unittest.TestCase): - def test_model_phi_1_logits(self): - input_ids = { - "input_ids": mindspore.tensor( - [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=mindspore.int64 - ) - } - - model = PhiForCausalLM.from_pretrained("microsoft/phi-1") - model.set_train(False) - - output = model(**input_ids).logits - - EXPECTED_OUTPUT = mindspore.tensor([[2.2671, 6.7684, -2.0107, -1.2440, -1.5335, -2.3828, 6.9186, 6.4245, 3.1548, 0.9998, 0.0760, 4.4653, 4.9857, 4.2956, 1.2308, -1.4178, 0.1361, 0.5191, -0.5699, -2.2201, -3.0750, -3.9600, -4.5936, -3.7394, -2.7777, 6.1874, -0.4148, -1.5684, -0.5967, 0.2395], [1.7004, 4.0383, 0.0546, 0.4530, -0.3619, -0.9021, 1.8355, 1.3587, 1.2406, 2.5775, -0.8834, 5.1910, 4.2565, 4.1406, 3.0752, -0.9099, 1.1595, 0.0264, 0.3243, -1.1803, -1.3945, -2.1406, -3.9939, -1.4438, -2.9546, 3.9204, 1.0851, -1.0598, -1.7819, -0.4827]]) # fmt: skip - - self.assertTrue(np.allclose(EXPECTED_OUTPUT.asnumpy(), output[0, :2, :30].asnumpy(), atol=1e-4, rtol=1e-4)) - - def test_model_phi_1_5_logits(self): - input_ids = { - "input_ids": mindspore.tensor( - [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=mindspore.int64 - ) - } - - model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5") - model.set_train(False) - - output = model(**input_ids).logits - - EXPECTED_OUTPUT = mindspore.tensor([[12.2922, 13.3507, 8.6963, 9.1355, 9.3502, 9.2667, 14.2027, 13.1363, 13.5446, 11.1337, 9.9279, 16.7195, 13.0768, 14.9141, 11.9965, 8.0233, 10.3129, 10.6118, 10.0204, 9.3827, 8.8344, 8.2806, 8.0153, 8.0540, 7.0964, 16.5743, 11.1256, 9.6987, 11.4770, 10.5440], [12.3323, 14.6050, 8.9986, 8.1580, 9.5654, 6.6728, 12.5966, 12.6662, 12.2784, 11.7522, 8.2039, 16.3102, 11.2203, 13.6088, 12.0125, 9.1021, 9.8216, 10.0987, 9.0926, 8.4260, 8.8009, 7.6547, 6.8075, 7.7881, 7.4501, 15.7451, 10.5053, 8.3129, 10.0027, 9.2612]]) # fmt: skip - - self.assertTrue(np.allclose(EXPECTED_OUTPUT.asnumpy(), output[0, :2, :30].asnumpy(), atol=1e-4, rtol=1e-4)) - - def test_model_phi_2_logits(self): - input_ids = { - "input_ids": mindspore.tensor( - [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=mindspore.int64 - ) - } - - model = PhiForCausalLM.from_pretrained("microsoft/phi-2") - model.set_train(False) - - output = model(**input_ids).logits - - EXPECTED_OUTPUT = mindspore.tensor([[6.4830, 6.1644, 3.4055, 2.2848, 5.4654, 2.8360, 5.5975, 5.5391, 7.3101, 4.2498, 2.5913, 10.3885, 6.4359, 8.7982, 5.6534, 0.5150, 2.7498, 3.1930, 2.4334, 1.7781, 1.5613, 1.3067, 0.8291, 0.5633, 0.6522, 9.8191, 5.5771, 2.7987, 4.2845, 3.7030], [6.0642, 7.8242, 3.4634, 1.9259, 4.3169, 2.0913, 6.0446, 3.6804, 6.6736, 4.0727, 2.1791, 11.4139, 5.6795, 7.5652, 6.2039, 2.7174, 4.3266, 3.6930, 2.8058, 2.6721, 2.3047, 2.0848, 2.0972, 2.0441, 1.3160, 9.2085, 
4.5557, 3.0296, 2.6045, 2.4059]]) # fmt: skip - print(output[0, :2, :30]) - - self.assertTrue(np.allclose(EXPECTED_OUTPUT.asnumpy(), output[0, :2, :30].asnumpy(), atol=1e-2, rtol=1e-2)) - - def test_phi_2_generation(self): - model = PhiForCausalLM.from_pretrained("microsoft/phi-2") - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2") - - inputs = tokenizer( - "Can you help me write a formal email to a potential business partner proposing a joint venture?", - return_tensors="ms", - return_attention_mask=False, - ) - - outputs = model.generate(**inputs, max_new_tokens=30) - output_text = tokenizer.batch_decode(outputs) - - EXPECTED_OUTPUT = [ - "Can you help me write a formal email to a potential business partner proposing a joint venture?\nInput: Company A: ABC Inc.\nCompany B: XYZ Ltd.\nJoint Venture: A new online platform for e-commerce" - ] - - self.assertListEqual(output_text, EXPECTED_OUTPUT) diff --git a/tests/transformers/models/phi3/__init__.py b/tests/transformers/models/phi3/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/phi3/test_modeling_phi3.py b/tests/transformers/models/phi3/test_modeling_phi3.py deleted file mode 100644 index e83b67b35..000000000 --- a/tests/transformers/models/phi3/test_modeling_phi3.py +++ /dev/null @@ -1,562 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Testing suite for the PyTorch Phi-3 model.""" - -import unittest -from typing import List - -from parameterized import parameterized - -from mindnlp.transformers import Phi3Config, StaticCache -from mindnlp.utils import is_mindspore_available -from mindnlp.engine import set_seed -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn - - from mindnlp.transformers import ( - AutoTokenizer, - Phi3ForCausalLM, - Phi3ForSequenceClassification, - Phi3ForTokenClassification, - Phi3Model, - ) - - end_of_text_token = 32000 - - class Phi3MiniWithStaticCache(nn.Module): - def __init__(self, model: Phi3ForCausalLM, batch_size: int, max_seq_len: int): - super().__init__() - self.model = model - self.cache = StaticCache( - config=model.config, - batch_size=batch_size, - max_cache_len=max_seq_len, - device=self.model.device, - dtype=self.model.dtype, - ) - - def forward( - self, - input_ids: mindspore.Tensor = None, - ) -> mindspore.Tensor: - return self.model.forward( - input_ids=input_ids, - use_cache=True, - return_dict=True, - past_key_values=self.cache, - ).logits - - @staticmethod - def generate(model: Phi3ForCausalLM, prompt_tokens: mindspore.Tensor, max_seq_len: int) -> List[int]: - model = Phi3MiniWithStaticCache(model, 1, max_seq_len + prompt_tokens.shape[-1]) - - response_tokens = [] - - for input_pos in range(prompt_tokens.shape[-1]): - result = model.forward( - input_ids=prompt_tokens[:, input_pos : input_pos + 1], - ) - response_tokens.append(prompt_tokens[0][input_pos].item()) - - current_token = ops.argmax(result[:, -1, :], dim=-1).item() - response_tokens.append(current_token) - - while current_token != end_of_text_token and len(response_tokens) < max_seq_len: - result = model.forward( - input_ids=mindspore.tensor([[current_token]], dtype=mindspore.int64), - ) - current_token = ops.argmax(result[:, -1, :], dim=-1).item() - response_tokens.append(current_token) - - return response_tokens - - -class Phi3ModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = 
type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Phi3Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Phi3 - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Phi3Model(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Phi3 - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Phi3Model(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Phi3 - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Phi3ForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Phi3 - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Phi3ForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class Phi3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (Phi3Model, Phi3ForCausalLM, Phi3ForSequenceClassification, Phi3ForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (Phi3ForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": Phi3Model, - "text-classification": Phi3ForSequenceClassification, - "text-generation": Phi3ForCausalLM, - "token-classification": Phi3ForTokenClassification, - "zero-shot": Phi3ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Phi3 - def setUp(self): - self.model_tester = Phi3ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Phi3Config, hidden_size=37) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_config - def test_config(self): - self.config_tester.run_common_tests() - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model with Llama->Phi3,llama->phi3 - def test_phi3_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Phi3ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_single_label with Llama->Phi3,llama->phi3 - def test_phi3_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Phi3ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_sequence_classification_model_for_multi_label with Llama->Phi3,llama->phi3 - def test_phi3_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = Phi3ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - @parameterized.expand([("longrope",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - 
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = Phi3Model(config) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - n_factors = config.hidden_size // config.num_attention_heads // 2 - config.rope_scaling = { - "type": scaling_type, - "short_factor": [5.0 for _ in range(n_factors)], - "long_factor": [5.0 for _ in range(n_factors)], - } - scaled_model = Phi3Model(config) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Scaling changes the RoPE embeddings, both for the short and long outputs - self.assertFalse(ops.allclose(original_short_output, scaled_short_output, atol=1e-5)) - self.assertFalse(ops.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - -@slow -@require_mindspore -class Phi3IntegrationTest(unittest.TestCase): - def test_model_phi3_mini_4k_instruct_logits(self): - input_ids = { - "input_ids": mindspore.tensor( - [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=mindspore.int64 - ) - } - - model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") - model.eval() - - output = model(**input_ids).logits - - EXPECTED_OUTPUT = mindspore.tensor([[ 0.9979, -1.9449, -2.5613, -2.2110, -0.9323, -2.2726, -3.2468, -2.0122,-1.0021, -1.2764, -1.0876, -1.2358, 3.9385, 6.2152, -0.3695, -2.3285,-1.2907, -1.8238, -1.9941, -2.2098, -0.6923, -1.6793, -1.1660, -2.0469,-0.7369, -1.4101, -1.4091, -3.1694, -1.8383, -1.1952],[ 3.0525, 1.9178, 3.7016, 0.9263, 0.3397, 1.9584, 2.1347, 0.3482, 1.3773, 0.2153, 0.2798, 0.8360, 9.0936, 11.4944, -0.3575, -0.9442,-0.1246, 1.3869, 0.9846, 1.7243, 0.9150, 1.0823, 0.4313, 1.5742, 0.2566, -0.1401, -1.3019, 0.4967, 0.6941, 0.7214]]) # fmt: skip - - self.assertTrue(ops.allclose(EXPECTED_OUTPUT, output[0, :2, :30], atol=1e-4, rtol=1e-4)) - - def test_phi3_mini_4k_instruct_generation(self): - model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct") - - messages = [ - { - "role": "system", - "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", - }, - {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, - ] - inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="ms") - - outputs = model.generate(inputs, max_new_tokens=32) - output_text = tokenizer.batch_decode(outputs) - - EXPECTED_OUTPUT = [ - "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. 
Here are some ideas for incorporating these fruits into your" - ] - - self.assertListEqual(output_text, EXPECTED_OUTPUT) - - def test_phi3_mini_4k_instruct_with_static_cache(self): - model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct") - - messages = [ - { - "role": "system", - "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", - }, - {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, - ] - inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="ms") - - response_tokens = Phi3MiniWithStaticCache.generate(model, inputs, 64) - - output_text = tokenizer.batch_decode(mindspore.tensor([response_tokens], dtype=mindspore.int64)) - - EXPECTED_OUTPUT = [ - "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some" - ] - - self.assertListEqual(output_text, EXPECTED_OUTPUT) - - def test_model_phi3_mini_128k_instruct_logits(self): - input_ids = { - "input_ids": mindspore.tensor( - [[1212, 318, 281, 1672, 2643, 290, 428, 318, 257, 1332]], dtype=mindspore.int64 - ) - } - - model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-128k-instruct") - model.eval() - - output = model(**input_ids).logits - - EXPECTED_OUTPUT = mindspore.tensor([[ 1.8478, -0.5709, -1.6792, -1.2133, -0.7809, -0.8817, -2.0969, -1.1191,-0.7731, -1.0483, -0.5961, -1.3067, 3.1325, 6.9442, -0.4803, -0.9154,-1.3085, -1.0822, -1.1433, -0.7660, -0.8531, -0.9150, -0.6179, -1.6153,-0.2239, -1.3207, -1.1187, -2.4795, -1.4733, -0.4931],[ 3.5839, 2.4722, 3.7130, 1.2032, 0.7356, 2.7777, 2.5256, 0.9157, 1.6431, 0.3533, 0.5100, 1.3512, 8.9873, 10.9815, 0.3530, 0.1473, 0.2051, 1.8553, 1.5988, 2.2268, 1.1897, 1.2829, 0.7894, 1.8895, 0.7666, 0.4122, -0.9316, 0.9936, 1.2722, 0.8263]]) # fmt: skip - - self.assertTrue(ops.allclose(EXPECTED_OUTPUT, output[0, :2, :30], atol=1e-4, rtol=1e-4)) - - def test_phi3_mini_128k_instruct_generation(self): - model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-128k-instruct") - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-128k-instruct") - - messages = [ - { - "role": "system", - "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", - }, - {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, - ] - inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="ms") - - outputs = model.generate(inputs, max_new_tokens=32) - output_text = tokenizer.batch_decode(outputs) - - EXPECTED_OUTPUT = [ - "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious and nutritious ways. 
Here are some creative and healthy" - ] - - self.assertListEqual(output_text, EXPECTED_OUTPUT) - - def test_phi3_mini_128k_instruct_with_static_cache(self): - model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-128k-instruct") - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-128k-instruct") - - messages = [ - { - "role": "system", - "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.", - }, - {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, - ] - inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="ms") - - response_tokens = Phi3MiniWithStaticCache.generate(model, inputs, 64) - - output_text = tokenizer.batch_decode(mindspore.tensor([response_tokens], dtype=mindspore.int64)) - - EXPECTED_OUTPUT = [ - "<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious and nutritious ways" - ] - - self.assertListEqual(output_text, EXPECTED_OUTPUT) \ No newline at end of file diff --git a/tests/transformers/models/pix2struct/__init__.py b/tests/transformers/models/pix2struct/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/pix2struct/test_image_processing_pix2struct.py b/tests/transformers/models/pix2struct/test_image_processing_pix2struct.py deleted file mode 100644 index 9646c6032..000000000 --- a/tests/transformers/models/pix2struct/test_image_processing_pix2struct.py +++ /dev/null @@ -1,355 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import requests -import numpy as np -from huggingface_hub import hf_hub_download - -from mindnlp.core import ops -from mindnlp.utils import is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available, require_vision - - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - from mindnlp.transformers import Pix2StructImageProcessor - - -class Pix2StructImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - size=None, - do_normalize=True, - do_convert_rgb=True, - patch_size=None, - ): - size = size if size is not None else {"height": 20, "width": 20} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.size = size - self.do_normalize = do_normalize - self.do_convert_rgb = do_convert_rgb - self.max_patches = [512, 1024, 2048, 4096] - self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16} - - def prepare_image_processor_dict(self): - return {"do_normalize": self.do_normalize, "do_convert_rgb": self.do_convert_rgb} - - def prepare_dummy_image(self): - img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - return raw_image - - - - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class Pix2StructImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = Pix2StructImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = Pix2StructImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "do_normalize")) - self.assertTrue(hasattr(image_processor, "do_convert_rgb")) - - def test_expected_patches(self): - dummy_image = self.image_processor_tester.prepare_dummy_image() - - image_processor = self.image_processing_class(**self.image_processor_dict) - max_patch = 2048 - - inputs = image_processor(dummy_image, return_tensors="ms", max_patches=max_patch) - self.assertTrue(ops.allclose(inputs.flattened_patches.mean(), mindspore.tensor(0.0606), atol=1e-3, rtol=1e-3)) - - def test_call_pil(self): - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - expected_hidden_dim = ( - 
(self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"]) - * self.image_processor_tester.num_channels - ) + 2 - - for max_patch in self.image_processor_tester.max_patches: - # Test not batched input - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (1, max_patch, expected_hidden_dim), - ) - - # Test batched - encoded_images = image_processor( - image_inputs, return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), - ) - - def test_call_vqa(self): - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - expected_hidden_dim = ( - (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"]) - * self.image_processor_tester.num_channels - ) + 2 - - image_processor.is_vqa = True - - for max_patch in self.image_processor_tester.max_patches: - # Test not batched input - with self.assertRaises(ValueError): - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch - ).flattened_patches - - dummy_text = "Hello" - - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch, header_text=dummy_text - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (1, max_patch, expected_hidden_dim), - ) - - # Test batched - encoded_images = image_processor( - image_inputs, return_tensors="ms", max_patches=max_patch, header_text=dummy_text - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), - ) - - def test_call_numpy(self): - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - expected_hidden_dim = ( - (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"]) - * self.image_processor_tester.num_channels - ) + 2 - - for max_patch in self.image_processor_tester.max_patches: - # Test not batched input - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (1, max_patch, expected_hidden_dim), - ) - - # Test batched - encoded_images = image_processor( - image_inputs, return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), - ) - - def test_call_numpy_4_channels(self): - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - self.image_processor_tester.num_channels = 4 - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, 
np.ndarray) - - expected_hidden_dim = ( - (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"]) - * self.image_processor_tester.num_channels - ) + 2 - - for max_patch in self.image_processor_tester.max_patches: - # Test not batched input - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch, input_data_format="channels_first" - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (1, max_patch, expected_hidden_dim), - ) - - # Test batched - encoded_images = image_processor( - image_inputs, return_tensors="ms", max_patches=max_patch, input_data_format="channels_first" - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), - ) - self.image_processor_tester.num_channels = 3 - - def test_call_pytorch(self): - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - for image in image_inputs: - self.assertIsInstance(image, mindspore.Tensor) - - # Test not batched input - expected_hidden_dim = ( - (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"]) - * self.image_processor_tester.num_channels - ) + 2 - - for max_patch in self.image_processor_tester.max_patches: - # Test not batched input - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (1, max_patch, expected_hidden_dim), - ) - - # Test batched - encoded_images = image_processor( - image_inputs, return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), - ) - - -@require_mindspore -@require_vision -class Pix2StructImageProcessingTestFourChannels(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = Pix2StructImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = Pix2StructImageProcessingTester(self, num_channels=4) - self.expected_encoded_image_num_channels = 3 - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "do_normalize")) - self.assertTrue(hasattr(image_processor, "do_convert_rgb")) - - def test_call_pil(self): - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - expected_hidden_dim = ( - (self.image_processor_tester.patch_size["height"] * self.image_processor_tester.patch_size["width"]) - * (self.image_processor_tester.num_channels - 1) - ) + 2 - - for max_patch in self.image_processor_tester.max_patches: - # Test not batched input - encoded_images = image_processor( - image_inputs[0], return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - 
(1, max_patch, expected_hidden_dim), - ) - - # Test batched - encoded_images = image_processor( - image_inputs, return_tensors="ms", max_patches=max_patch - ).flattened_patches - self.assertEqual( - encoded_images.shape, - (self.image_processor_tester.batch_size, max_patch, expected_hidden_dim), - ) - - @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy - def test_call_numpy(self): - return super().test_call_numpy() - - @unittest.skip(reason="Pix2StructImageProcessor does not support 4 channels yet") # FIXME Amy - def test_call_pytorch(self): - return super().test_call_torch() - - @unittest.skip( - reason="Pix2StructImageProcessor does treat numpy and PIL 4 channel images consistently" - ) # FIXME Amy - def test_call_numpy_4_channels(self): - return super().test_call_torch() diff --git a/tests/transformers/models/pix2struct/test_modeling_pix2struct.py b/tests/transformers/models/pix2struct/test_modeling_pix2struct.py deleted file mode 100644 index b0e2b7f00..000000000 --- a/tests/transformers/models/pix2struct/test_modeling_pix2struct.py +++ /dev/null @@ -1,846 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Pix2Struct model.""" - -import copy -import inspect -import os -import tempfile -import unittest - -import numpy as np -import requests - -from mindnlp.transformers import Pix2StructConfig, Pix2StructTextConfig, Pix2StructVisionConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - Pix2StructForConditionalGeneration, - Pix2StructProcessor, - Pix2StructTextModel, - Pix2StructVisionModel, - ) - - -if is_vision_available(): - from PIL import Image - -class Pix2StructVisionModelTester: - def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=12, - patch_embed_hidden_size=12, - projection_dim=32, - max_patches=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_embed_hidden_size = patch_embed_hidden_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.hidden_size = hidden_size - self.max_patches = max_patches - self.seq_length = self.max_patches - self.patch_proj_dim = ((patch_size**2) * num_channels) + 
2 - - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - flattened_patches = floats_tensor([self.batch_size, self.max_patches, self.patch_proj_dim]) - config = self.get_config() - - return config, flattened_patches - - def get_config(self): - return Pix2StructVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - patch_embed_hidden_size=self.patch_embed_hidden_size, - ) - - def create_and_check_model(self, config, flattened_patches): - model = Pix2StructVisionModel(config=config) - model.eval() - with no_grad(): - result = model(flattened_patches) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, flattened_patches = config_and_inputs - inputs_dict = { - "flattened_patches": flattened_patches, - "attention_mask": ops.randint(0, 2, (self.batch_size, self.max_patches)), - } - return config, inputs_dict - - -@require_mindspore -class Pix2StructVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Pix2Struct does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (Pix2StructVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = Pix2StructVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=Pix2StructVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Pix2StructVision does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["flattened_patches"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") - def test_training(self): - pass - - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Pix2StructVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Pix2StructVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/pix2struct-textcaps-base" - model = Pix2StructVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class Pix2StructTextModelTester: - def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=12, - projection_dim=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = 
use_input_mask - self.use_labels = use_labels - self.d_kv = hidden_size // num_attention_heads - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[batch_idx, :int(start_index)] = 1 - input_mask[batch_idx, int(start_index):] = 0 - - config = self.get_config() - - return config, input_ids, input_mask - - def get_config(self): - return Pix2StructTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - bos_token_id=self.bos_token_id, - d_kv=self.d_kv, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = Pix2StructTextModel(config=config) - model.eval() - with no_grad(): - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class Pix2StructTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Pix2StructTextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = Pix2StructTextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Pix2StructTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") - def test_training(self): - pass - - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Pix2Struct does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Pix2StructTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Pix2StructTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/pix2struct-textcaps-base" - model = Pix2StructTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class Pix2StructModelTester: - def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.text_model_tester = Pix2StructTextModelTester(parent, **text_kwargs) - self.vision_model_tester = Pix2StructVisionModelTester(parent, **vision_kwargs) - self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests - self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, flattened_patches = self.vision_model_tester.prepare_config_and_inputs() - - config = self.get_config(text_config, vision_config) - - return config, input_ids, attention_mask, flattened_patches - - def get_config(self, text_config, vision_config): - return Pix2StructConfig.from_text_vision_configs(text_config, vision_config, projection_dim=64) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, decoder_attention_mask, flattened_patches = config_and_inputs - - attention_mask = (ops.sum(flattened_patches, dim=-1) != 0).float() - - inputs_dict = { - "decoder_input_ids": input_ids, - "labels": input_ids, - "decoder_attention_mask": decoder_attention_mask, - "flattened_patches": flattened_patches, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class Pix2StructModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Pix2StructForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = True - test_attention_outputs = False - test_torchscript = False - - def setUp(self): - self.model_tester = Pix2StructModelTester(self) - - def test_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - - output = model(**input_dict) - self.assertEqual( - output[1].shape, - ( - self.model_tester.vision_model_tester.batch_size, - self.model_tester.text_model_tester.seq_length, - self.model_tester.text_model_tester.vocab_size, - ), - ) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): 
- pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Pix2StructModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "flattened_patches", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", - "labels", - "decoder_inputs_embeds", - "use_cache", - ] - - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes[:-1]: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - model = model_class(config) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - # hardcode labels to be the same as input_ids - inputs["labels"] = inputs["input_ids"] - - loss = model(**inputs).loss - loss.backward() - - def test_training_gradient_checkpointing(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - - for model_class in self.all_model_classes[:-1]: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - - model = model_class(config) - model.gradient_checkpointing_enable() - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - # hardcode labels to be the same as input_ids - inputs["labels"] = inputs["input_ids"] - - loss = model(**inputs).loss - loss.backward() - - # override as the `logit_scale` parameter initilization is different for Pix2Struct - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite because `vocab_size` is not an attribute of `Pix2StructConfig` but rather `Pix2StructTextConfig` - def test_resize_tokens_embeddings(self): - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to False") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - 
if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.text_config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Decoder input ids should be clamped to the maximum size of the vocabulary - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - inputs_dict["labels"] = inputs_dict["labels"].clamp(max=model_vocab_size - 15 - 1) - - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
- models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - # overwrite because `vocab_size` is not an attribute of `Pix2StructConfig` but rather `Pix2StructTextConfig` - def test_resize_embeddings_untied(self): - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to False") - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - self.skipTest(reason="Model cannot untie embeddings") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Decoder input ids should be clamped to the maximum size of the vocabulary - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - inputs_dict["labels"] = inputs_dict["labels"].clamp(max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - @unittest.skip(reason="Pix2Struct doesn't use tied weights") - def test_tied_model_weights_key_ignore(self): - pass - - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - self.skipTest(reason="test_torchscript is set to False") - - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - configs_no_init.torchscript = True - configs_no_init.return_dict = False - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.eval() - - try: - input_ids = inputs_dict["input_ids"] - flattened_patches = inputs_dict["flattened_patches"] # Pix2Struct needs flattened_patches - traced_model = ops.jit.trace(model, (input_ids, flattened_patches)) - except RuntimeError: - 
self.fail("Couldn't trace module.") - - with tempfile.TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") - - try: - ops.jit.save(traced_model, pt_file_name) - except Exception: - self.fail("Couldn't save module.") - - try: - loaded_model = ops.jit.load(pt_file_name) - except Exception: - self.fail("Couldn't load module.") - - model.eval() - - loaded_model.eval() - - model_state_dict = model.state_dict() - loaded_model_state_dict = loaded_model.state_dict() - - non_persistent_buffers = {} - for key in loaded_model_state_dict.keys(): - if key not in model_state_dict.keys(): - non_persistent_buffers[key] = loaded_model_state_dict[key] - - loaded_model_state_dict = { - key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers - } - - self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys())) - - model_buffers = list(model.buffers()) - for non_persistent_buffer in non_persistent_buffers.values(): - found_buffer = False - for i, model_buffer in enumerate(model_buffers): - if ops.equal(non_persistent_buffer, model_buffer): - found_buffer = True - break - - self.assertTrue(found_buffer) - model_buffers.pop(i) - - models_equal = True - for layer_name, p1 in model_state_dict.items(): - p2 = loaded_model_state_dict[layer_name] - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save Pix2StructConfig and check if we can load Pix2StructVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = Pix2StructVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) - - # Save Pix2StructConfig and check if we can load Pix2StructTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = Pix2StructTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - - -# We will verify our results on an image of a stop sign -def prepare_img(): - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@require_vision -@require_mindspore -@slow -class Pix2StructIntegrationTest(unittest.TestCase): - def test_inference_image_captioning(self): - model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base") - processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base") - image = prepare_img() - - # image only - inputs = processor(images=image, return_tensors="ms") - - predictions = model.generate(**inputs) - - self.assertEqual( - processor.decode(predictions[0], skip_special_tokens=True), "A stop sign is on a street corner." 
- ) - - def test_batched_inference_image_captioning(self): - model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base") - processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base") - image_1 = prepare_img() - - second_url = ( - "https://www.connollycove.com/wp-content/uploads/2019/06/temple-bar-dublin-world-famous-irish-pub.jpg" - ) - image_2 = Image.open(requests.get(second_url, stream=True).raw) - - # image only - inputs = processor(images=[image_1, image_2], return_tensors="ms") - - predictions = model.generate(**inputs) - - self.assertEqual( - processor.decode(predictions[0], skip_special_tokens=True), "A stop sign is on a street corner." - ) - - self.assertEqual( - processor.decode(predictions[1], skip_special_tokens=True), - "A row of books including The Temple Bar and Guiness.", - ) - - def test_batched_inference_image_captioning_conditioned(self): - model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base") - processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base") - image_1 = prepare_img() - - second_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg" - image_2 = Image.open(requests.get(second_url, stream=True).raw) - texts = ["A picture of", "An photography of"] - - # image only - inputs = processor(images=[image_1, image_2], text=texts, return_tensors="ms", add_special_tokens=False) - - predictions = model.generate(**inputs) - - self.assertEqual( - processor.decode(predictions[0], skip_special_tokens=True), - "A picture of a stop sign with a red stop sign", - ) - - self.assertEqual( - processor.decode(predictions[1], skip_special_tokens=True), - "An photography of the Temple Bar and other places in the city.", - ) - - def test_vqa_model(self): - model_id = "google/pix2struct-ai2d-base" - - image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg" - image = Image.open(requests.get(image_url, stream=True).raw) - - model = Pix2StructForConditionalGeneration.from_pretrained(model_id, torch_dtype=mindspore.bfloat16) - processor = Pix2StructProcessor.from_pretrained(model_id) - - # image only - text = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud" - - inputs = processor(images=image, return_tensors="ms", text=text).to(mindspore.bfloat16) - - predictions = model.generate(**inputs) - self.assertEqual(processor.decode(predictions[0], skip_special_tokens=True), "ash cloud") - - def test_vqa_model_batched(self): - model_id = "google/pix2struct-ai2d-base" - - image_urls = [ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo-2.png", - ] - - images = [Image.open(requests.get(image_url, stream=True).raw) for image_url in image_urls] - - texts = [ - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - "What is the producer in the diagram? 
(1) Phytoplankton (2) Zooplankton (3) Large fish (4) Small fish", - ] - - model = Pix2StructForConditionalGeneration.from_pretrained(model_id, torch_dtype=mindspore.bfloat16) - processor = Pix2StructProcessor.from_pretrained(model_id) - - inputs = processor(images=images, return_tensors="ms", text=texts).to(mindspore.bfloat16) - - predictions = model.generate(**inputs) - self.assertEqual(processor.decode(predictions[0], skip_special_tokens=True), "ash cloud") - self.assertEqual(processor.decode(predictions[1], skip_special_tokens=True), "Phytoplankton") \ No newline at end of file diff --git a/tests/transformers/models/pix2struct/test_processor_pix2struct.py b/tests/transformers/models/pix2struct/test_processor_pix2struct.py deleted file mode 100644 index 51b4e658e..000000000 --- a/tests/transformers/models/pix2struct/test_processor_pix2struct.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import shutil -import tempfile -import unittest - -import numpy as np -import pytest - -from mindnlp.utils import is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ( - AutoProcessor, - Pix2StructImageProcessor, - Pix2StructProcessor, - PreTrainedTokenizerFast, - T5Tokenizer, - ) - - -@require_vision -@require_mindspore -class Pix2StructProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - image_processor = Pix2StructImageProcessor() - tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") - - processor = Pix2StructProcessor(image_processor, tokenizer) - - processor.save_pretrained(self.tmpdirname) - - def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """ - This function prepares a list of random PIL images of the same fixed size. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - - def test_save_load_pretrained_additional_features(self): - processor = Pix2StructProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = Pix2StructProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, PreTrainedTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, Pix2StructImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str, return_token_type_ids=False, add_special_tokens=True) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual( - list(inputs.keys()), ["flattened_patches", "attention_mask", "decoder_attention_mask", "decoder_input_ids"] - ) - - # test if it raises when no input is passed - with pytest.raises(ValueError): - processor() - - def test_processor_max_patches(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - max_patches = [512, 1024, 2048, 4096] - expected_hidden_size = [770, 770, 770, 770] - # with text - for i, max_patch in enumerate(max_patches): - inputs = processor(text=input_str, images=image_input, max_patches=max_patch) - self.assertEqual(inputs["flattened_patches"][0].shape[0], max_patch) - self.assertEqual(inputs["flattened_patches"][0].shape[1], expected_hidden_size[i]) - - # without text input - for i, max_patch in enumerate(max_patches): - inputs = processor(images=image_input, max_patches=max_patch) - 
self.assertEqual(inputs["flattened_patches"][0].shape[0], max_patch) - self.assertEqual(inputs["flattened_patches"][0].shape[1], expected_hidden_size[i]) - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - # For now the processor supports only ["flattened_patches", "input_ids", "attention_mask", "decoder_attention_mask"] - self.assertListEqual( - list(inputs.keys()), ["flattened_patches", "attention_mask", "decoder_attention_mask", "decoder_input_ids"] - ) - - inputs = processor(text=input_str) - - # For now the processor supports only ["flattened_patches", "input_ids", "attention_mask", "decoder_attention_mask"] - self.assertListEqual(list(inputs.keys()), ["input_ids", "attention_mask"]) - diff --git a/tests/transformers/models/plbart/__init__.py b/tests/transformers/models/plbart/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/plbart/test_modeling_plbart.py b/tests/transformers/models/plbart/test_modeling_plbart.py deleted file mode 100644 index 7a02130fe..000000000 --- a/tests/transformers/models/plbart/test_modeling_plbart.py +++ /dev/null @@ -1,666 +0,0 @@ -# coding=utf-8 -# Copyright 2022, The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch PLBART model.""" - -import copy -import tempfile -import unittest -import numpy as np -from mindnlp.utils import cached_property -from mindnlp.transformers import PLBartConfig -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - is_mindspore_available, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import ops - - from mindnlp.transformers import ( - AutoTokenizer, - PLBartForCausalLM, - PLBartForConditionalGeneration, - PLBartForSequenceClassification, - PLBartModel, - ) - from mindnlp.transformers.models.plbart.modeling_plbart import PLBartDecoder, PLBartEncoder - - -def prepare_plbart_inputs_dict( - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class PLBartModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=100, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( - 3, - ) - input_ids[:, -1] = self.eos_token_id # Eos Token - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - config = self.get_config() - inputs_dict = prepare_plbart_inputs_dict(config, input_ids, decoder_input_ids) - return config, inputs_dict - - def get_config(self): - return PLBartConfig( - 
vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = PLBartModel(config=config).get_decoder().set_train(False) - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - head_mask = inputs_dict["head_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.astype(ms.bool_)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_with_past_key_values = model( - next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values - ) - output_from_past = output_with_past_key_values["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = PLBartModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = PLBartEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ - 0 - ] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = PLBartDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=inputs_dict["attention_mask"], - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 
1e-3) - - -@require_mindspore -class PLBartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (PLBartModel, PLBartForConditionalGeneration, PLBartForSequenceClassification) if is_mindspore_available() else () - ) - all_generative_model_classes = (PLBartForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": PLBartModel, - "summarization": PLBartForConditionalGeneration, - "text-classification": PLBartForSequenceClassification, - "text-generation": PLBartForCausalLM, - "text2text-generation": PLBartForConditionalGeneration, - "translation": PLBartForConditionalGeneration, - "zero-shot": PLBartForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = False # Fix me Michael - test_pruning = False - test_missing_keys = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "TranslationPipelineTests": - # Get `ValueError: Translation requires a `src_lang` and a `tgt_lang` for this model`. - # `PLBartConfig` was never used in pipeline tests: cannot create a simple tokenizer. - return True - - return False - - def setUp(self): - self.model_tester = PLBartModelTester(self) - self.config_tester = ConfigTester(self, config_class=PLBartConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - # PLBartForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (PLBartModel, PLBartForConditionalGeneration): - model = model_class(config) - model.set_train(False) - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - model(**inputs)[0] - - - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = PLBartForConditionalGeneration(config).set_train(False) - model.half() 
- model.generate(input_ids, attention_mask=attention_mask) - model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - @unittest.skip(reason="Failing since #26752") - def test_sample_generate(self): - pass - - -def assert_tensors_close(a, b, atol=1e-12, prefix=""): - """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" - if a is None and b is None: - return True - try: - if np.allclose(a.asnumpy(), b.asnumpy(), atol=atol): - return True - raise - except Exception: - pct_different = (ops.gt((a - b).abs(), atol)).float().mean().item() - if a.numel() > 100: - msg = f"tensor values are {pct_different:.1%} percent different." - else: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def _long_tensor(tok_lst): - return ms.tensor(tok_lst, dtype=ms.int64) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class AbstractSeq2SeqIntegrationTest(unittest.TestCase): - maxDiff = 1000 # longer string compare tracebacks - checkpoint_name = None - - @classmethod - def setUpClass(cls): - cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) - return cls - - @cached_property - def model(self): - """Only load the model if needed.""" - model = PLBartForConditionalGeneration.from_pretrained(self.checkpoint_name) - return model - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class PLBartJavaCsIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "uclanlp/plbart-java-cs" - src_text = [ - "public int maximum(int a, int b, int c){return Math.max(a, Math.max(b, c));}", - "public int product(int a, int b, int c){return a*b*c;}", - ] - tgt_text = [ - "public int maximum(int a, int b, int c){return Math.Max(", - "public int Product(int a, int b, int c){return a * b *", - ] - - @slow - def test_java_cs_generate_one(self): - batch = self.tokenizer( - ["public int maximum(int a, int b, int c){return Math.max(a, Math.max(b, c));}"], return_tensors="ms" - ) - # batch = batch - translated_tokens = self.model.generate(**batch) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - self.assertEqual(self.tgt_text[0], decoded[0]) - # self.assertEqual(self.tgt_text[1], decoded[1]) - - @slow - def test_java_cs_generate_batch(self): - batch = self.tokenizer(self.src_text, return_tensors="ms", padding=True, truncation=True) - # batch = batch - translated_tokens = self.model.generate(**batch) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded - - def test_plbart_java_cs_config(self): - plbart_models = ["uclanlp/plbart-java-cs"] - expected = {"scale_embedding": True} - for name in plbart_models: - config = PLBartConfig.from_pretrained(name) - for k, v in expected.items(): - try: - self.assertEqual(v, getattr(config, k)) - except AssertionError as e: - e.args += (name, k) - raise - - def test_plbart_fast_forward(self): - config = PLBartConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, - ) - lm_model = PLBartForConditionalGeneration(config) - context = ms.tensor( - [[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]], dtype=ms.int64 - ) - summary = ms.tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]], dtype=ms.int64) - 
result = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) - expected_shape = (*summary.shape, config.vocab_size) - self.assertEqual(result.logits.shape, expected_shape) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class PLBartBaseIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "uclanlp/plbart-base" - src_text = ["Is 0 the first Fibonacci number ?", "Find the sum of all prime numbers ."] - tgt_text = ["0 the first Fibonacci number?", "the sum of all prime numbers.......... the the"] - - def test_base_generate(self): - inputs = self.tokenizer([self.src_text[0]], return_tensors="ms") - src_lan = self.tokenizer._convert_lang_code_special_format("en_XX") - translated_tokens = self.model.generate( - input_ids=inputs["input_ids"], - decoder_start_token_id=self.tokenizer.lang_code_to_id[src_lan], - ) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - self.assertEqual(self.tgt_text[0], decoded[0]) - - @slow - def test_fill_mask(self): - inputs = self.tokenizer(["Is 0 the Fibonacci ?"], return_tensors="ms") - src_lan = self.tokenizer._convert_lang_code_special_format("en_XX") - outputs = self.model.generate( - inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id[src_lan], num_beams=1 - ) - prediction: str = self.tokenizer.batch_decode( - outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True - )[0] - self.assertEqual(prediction, "0 0 the 0 the 0 the 0 the 0 the 0 the 0 the 0 the") - - -class PLBartStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - encoder_attention_heads=4, - decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_attention_heads = encoder_attention_heads - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = PLBartConfig( 
- vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_attention_heads=self.encoder_attention_heads, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return (config, input_ids, attention_mask, lm_labels) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = PLBartDecoder(config=config).set_train(False) - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = PLBartDecoder(config=config).set_train(False) - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=ms.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=ms.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = 
output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, attention_mask, lm_labels) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class PLBartStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (PLBartDecoder, PLBartForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (PLBartForCausalLM,) if is_mindspore_available() else () - test_pruning = False - is_encoder_decoder = False - - def setUp(self): - self.model_tester = PLBartStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=PLBartConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - @unittest.skip(reason="Decoder cannot keep gradients") - def test_retain_grad_hidden_states_attentions(self): - return diff --git a/tests/transformers/models/poolformer/__init__.py b/tests/transformers/models/poolformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/poolformer/test_image_processing_poolformer.py b/tests/transformers/models/poolformer/test_image_processing_poolformer.py deleted file mode 100644 index 8cc363833..000000000 --- a/tests/transformers/models/poolformer/test_image_processing_poolformer.py +++ /dev/null @@ -1,120 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PoolFormer model image processing.""" - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from mindnlp.transformers import PoolFormerImageProcessor - - -class PoolFormerImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize_and_center_crop=True, - size=None, - crop_pct=0.9, - crop_size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"shortest_edge": 30} - crop_size = crop_size if crop_size is not None else { - "height": 30, "width": 30} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize_and_center_crop = do_resize_and_center_crop - self.size = size - self.crop_pct = crop_pct - self.crop_size = crop_size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "size": self.size, - "do_resize_and_center_crop": self.do_resize_and_center_crop, - "crop_pct": self.crop_pct, - "crop_size": self.crop_size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class PoolFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = PoolFormerImageProcessor - - def setUp(self): - self.image_processor_tester = PoolFormerImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class( - **self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize_and_center_crop")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "crop_pct")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 30}) - self.assertEqual(image_processor.crop_size, { - "height": 30, "width": 30}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, { - "height": 84, "width": 84}) diff --git a/tests/transformers/models/poolformer/test_modeling_poolformer.py 
b/tests/transformers/models/poolformer/test_modeling_poolformer.py deleted file mode 100644 index a1032da58..000000000 --- a/tests/transformers/models/poolformer/test_modeling_poolformer.py +++ /dev/null @@ -1,244 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch PoolFormer model.""" - -import unittest - -import numpy as np - -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore as ms - - from mindnlp.transformers import MODEL_MAPPING, PoolFormerConfig, PoolFormerForImageClassification, PoolFormerModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import PoolFormerImageProcessor - - -class PoolFormerConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "hidden_sizes")) - self.parent.assertTrue(hasattr(config, "num_encoder_blocks")) - - -class PoolFormerModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=64, - num_channels=3, - num_encoder_blocks=4, - depths=[2, 2, 2, 2], - sr_ratios=[8, 4, 2, 1], - hidden_sizes=[16, 32, 64, 128], - downsampling_rates=[1, 4, 8, 16], - is_training=False, - use_labels=True, - hidden_act="gelu", - hidden_dropout_prob=0.1, - initializer_range=0.02, - num_labels=3, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.num_encoder_blocks = num_encoder_blocks - self.sr_ratios = sr_ratios - self.depths = depths - self.hidden_sizes = hidden_sizes - self.downsampling_rates = downsampling_rates - self.is_training = is_training - self.use_labels = use_labels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor( - [self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = PoolFormerConfig( - image_size=self.image_size, - num_channels=self.num_channels, - num_encoder_blocks=self.num_encoder_blocks, - depths=self.depths, - hidden_sizes=self.hidden_sizes, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - initializer_range=self.initializer_range, - ) - - return config, pixel_values, labels - - def create_and_check_model(self, config, pixel_values, labels): - model = 
PoolFormerModel(config=config) - model.set_train(False) - result = model(pixel_values) - expected_height = expected_width = self.image_size // 32.0 - self.parent.assertEqual( - result.last_hidden_state.shape, ( - self.batch_size, self.hidden_sizes[-1], expected_height, expected_width) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class PoolFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - PoolFormerModel, PoolFormerForImageClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": PoolFormerModel, - "image-classification": PoolFormerForImageClassification} - if is_mindspore_available() - else {} - ) - - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - has_attentions = False - - def setUp(self): - self.model_tester = PoolFormerModelTester(self) - self.config_tester = PoolFormerConfigTester( - self, config_class=PoolFormerConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="PoolFormer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="PoolFormer does not have get_input_embeddings method and get_output_embeddings methods") - def test_model_get_set_embeddings(self): - pass - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = self.model_tester.num_encoder_blocks - self.assertEqual(len(hidden_states), expected_num_layers) - - # verify the first hidden states (first block) - self.assertListEqual( - list(hidden_states[0].shape[-3:]), - [ - self.model_tester.hidden_sizes[0], - self.model_tester.image_size // 4, - self.model_tester.image_size // 4, - ], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_training(self): - if not self.model_tester.is_training: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - if model_class in MODEL_MAPPING.values(): - continue - model = model_class(config) - model.train() - inputs = self._prepare_for_class( - inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() - - @slow - def test_model_from_pretrained(self): - model_name = "sail/poolformer_s12" - model = PoolFormerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - 
return image - - -@require_mindspore -class PoolFormerModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_image_classification_head(self): - image_processor = PoolFormerImageProcessor() - model = PoolFormerForImageClassification.from_pretrained( - "sail/poolformer_s12") - - inputs = image_processor(images=prepare_img(), return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = ms.Tensor([-0.6113, 0.1685, -0.0492]) - self.assertTrue(np.allclose( - outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/pop2piano/__init__.py b/tests/transformers/models/pop2piano/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/pop2piano/test_modeling_pop2piano.py b/tests/transformers/models/pop2piano/test_modeling_pop2piano.py deleted file mode 100644 index 90f554949..000000000 --- a/tests/transformers/models/pop2piano/test_modeling_pop2piano.py +++ /dev/null @@ -1,754 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Pop2Piano model.""" - -import copy -import tempfile -import unittest - -import numpy as np - -from mindnlp.dataset import load_dataset -from mindnlp.transformers import Pop2PianoConfig -from mindnlp.transformers.feature_extraction_utils import BatchFeature -from mindnlp.utils.testing_utils import ( - require_essentia, - require_librosa, - require_scipy, - require_mindspore, - slow, -) -from mindnlp.utils import is_essentia_available, is_librosa_available, is_scipy_available, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - from mindnlp.engine import set_seed - - from mindnlp.transformers import Pop2PianoForConditionalGeneration - - -@require_mindspore -class Pop2PianoModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=9, - # For common tests - is_training=False, - use_attention_mask=True, - use_labels=True, - hidden_size=64, - num_hidden_layers=5, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - decoder_start_token_id=0, - scope=None, - decoder_layers=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = 
use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.scope = None - self.decoder_layers = decoder_layers - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = ( - ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) if self.use_labels else None - ) - - return self.get_config(), input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels - - def get_pipeline_config(self): - return Pop2PianoConfig( - vocab_size=166, # Pop2Piano forces 100 extra tokens - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def get_config(self): - return Pop2PianoConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def check_prepare_lm_labels_via_shift_left( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config) - model.eval() - - # make sure that lm_labels are correctly padded from the right - lm_labels = lm_labels.masked_fill((lm_labels == self.decoder_start_token_id), self.eos_token_id) - - # add causal pad token mask - triangular_mask = ops.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() - lm_labels = lm_labels.masked_fill(triangular_mask, self.pad_token_id) - decoder_input_ids = model._shift_right(lm_labels) - - for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): - # first item - self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) - if i < decoder_input_ids_slice.shape[-1]: - if i < decoder_input_ids.shape[-1] - 1: - # items before diagonal - self.parent.assertListEqual( - 
decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() - ) - # pad items after diagonal - if i < decoder_input_ids.shape[-1] - 2: - self.parent.assertListEqual( - decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() - ) - else: - # all items after square - self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) - - def create_and_check_model( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config) - model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_with_lm_head( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = 
Pop2PianoForConditionalGeneration(config=config).get_decoder() - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_generate_with_past_key_values( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config).eval() - set_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False - ) - set_seed(0) - output_with_past_cache = 
model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = Pop2PianoForConditionalGeneration(config=config).half().eval() - output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[ - "encoder_last_hidden_state" - ] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_encoder_decoder_shared_weights( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - for model_class in [Pop2PianoForConditionalGeneration]: - set_seed(0) - model = model_class(config=config).eval() - # load state dict copies weights but does not tie them - model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) - - set_seed(0) - tied_config = copy.deepcopy(config) - tied_config.tie_encoder_decoder = True - tied_model = model_class(config=tied_config).eval() - - model_result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 - ) - ) - - # check that outputs after saving and loading are equal - with tempfile.TemporaryDirectory() as tmpdirname: - tied_model.save_pretrained(tmpdirname) - tied_model = model_class.from_pretrained(tmpdirname) - tied_model.eval() - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], - tied_model_result[0][0, :, random_slice_idx], - atol=1e-4, - ) - ) - - def check_resize_embeddings_pop2piano_v1_1( - self, - config, - ): - prev_vocab_size = config.vocab_size - - config.tie_word_embeddings = False - model = Pop2PianoForConditionalGeneration(config=config).eval() - model.resize_token_embeddings(prev_vocab_size - 10) - - self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - 
"input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "use_cache": False, - } - return config, inputs_dict - - -@require_mindspore -class Pop2PianoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (Pop2PianoForConditionalGeneration,) if is_mindspore_available() else () - all_generative_model_classes = () - pipeline_model_mapping = ( - {"automatic-speech-recognition": Pop2PianoForConditionalGeneration} if is_mindspore_available() else {} - ) - all_parallelizable_model_classes = () - fx_compatible = False - test_pruning = False - test_resize_embeddings = True - test_model_parallel = False - is_encoder_decoder = True - - def setUp(self): - self.model_tester = Pop2PianoModelTester(self) - self.config_tester = ConfigTester(self, config_class=Pop2PianoConfig, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_shift_right(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_v1_1(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - # check that gated gelu feed forward and different word embeddings work - config = config_and_inputs[0] - config.tie_word_embeddings = False - config.feed_forward_proj = "gated-gelu" - self.model_tester.create_and_check_model(config, *config_and_inputs[1:]) - - def test_config_and_model_silu_gated(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - config.feed_forward_proj = "gated-silu" - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_with_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_lm_head(*config_and_inputs) - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_past_with_attn_mask(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_decoder_model_past_with_3d_attn_mask(self): - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = self.model_tester.prepare_config_and_inputs() - - attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length], - vocab_size=2, - ) - decoder_attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length], - vocab_size=2, - ) - - self.model_tester.create_and_check_decoder_model_attention_mask_past( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_shared_weights(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_v1_1_resize_embeddings(self): - config = self.model_tester.prepare_config_and_inputs()[0] - self.model_tester.check_resize_embeddings_pop2piano_v1_1(config) - - @slow - def test_model_from_pretrained(self): - model_name = "sweetcocoa/pop2piano" - model = Pop2PianoForConditionalGeneration.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_pass_with_input_features(self): - input_features = BatchFeature( - { - "input_features": ops.rand((75, 100, 512)).type(mindspore.float32), - "beatsteps": ops.randint(size=(1, 955), low=0, high=100).type(mindspore.float32), - "extrapolated_beatstep": ops.randint(size=(1, 900), low=0, high=100).type(mindspore.float32), - } - ) - model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") - model_opts = model.generate(input_features=input_features["input_features"], return_dict_in_generate=True) - - self.assertEqual(model_opts.sequences.ndim, 2) - - def test_pass_with_batched_input_features(self): - input_features = BatchFeature( - { - "input_features": ops.rand((220, 70, 512)).type(mindspore.float32), - "beatsteps": ops.randint(size=(5, 955), low=0, high=100).type(mindspore.float32), - "extrapolated_beatstep": ops.randint(size=(5, 900), low=0, high=100).type(mindspore.float32), - "attention_mask": ops.concatenate( - [ - ops.ones([120, 70], dtype=mindspore.int32), - ops.zeros([1, 70], dtype=mindspore.int32), - ops.ones([50, 70], dtype=mindspore.int32), - ops.zeros([1, 70], dtype=mindspore.int32), - ops.ones([47, 70], dtype=mindspore.int32), - ops.zeros([1, 70], dtype=mindspore.int32), - ], - dim=0, - ), - "attention_mask_beatsteps": ops.ones((5, 955)).type(mindspore.int32), - "attention_mask_extrapolated_beatstep": ops.ones((5, 900)).type(mindspore.int32), - } - ) - model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") - model_opts = model.generate( - input_features=input_features["input_features"], - attention_mask=input_features["attention_mask"], - return_dict_in_generate=True, - ) - - self.assertEqual(model_opts.sequences.ndim, 2) - - -@require_mindspore -class Pop2PianoModelIntegrationTests(unittest.TestCase): - @slow - def test_mel_conditioner_integration(self): - composer = "composer1" - model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") - input_embeds = ops.ones([10, 100, 512]) - - composer_value = model.generation_config.composer_to_feature_token[composer] - composer_value = mindspore.tensor(composer_value) - composer_value = composer_value.repeat(input_embeds.size(0)) - outputs = model.mel_conditioner( - input_embeds, composer_value, min(model.generation_config.composer_to_feature_token.values()) - ) - - # check shape - self.assertEqual(outputs.shape, (10, 101, 512)) - - # check values - EXPECTED_OUTPUTS = mindspore.tensor( - [[1.0475305318832397, 0.29052114486694336, -0.47778210043907166], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] - ) - - self.assertTrue(ops.allclose(outputs[0, :3, :3], EXPECTED_OUTPUTS, atol=1e-4)) - - @slow - @require_essentia - @require_librosa - @require_scipy - def test_full_model_integration(self): - if is_librosa_available() and is_scipy_available() and is_essentia_available() and is_mindspore_available(): - 
from transformers import Pop2PianoProcessor - - speech_input1 = np.zeros([1_000_000], dtype=np.float32) - sampling_rate = 44_100 - - processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") - input_features = processor.feature_extractor( - speech_input1, sampling_rate=sampling_rate, return_tensors="ms" - ) - - model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") - outputs = model.generate( - input_features=input_features["input_features"], return_dict_in_generate=True - ).sequences - - # check for shapes - self.assertEqual(outputs.size(0), 70) - - # check for values - self.assertEqual(outputs[0, :2].cpu().numpy().tolist(), [0, 1]) - - # This is the test for a real music from K-Pop genre. - @slow - @require_essentia - @require_librosa - @require_scipy - def test_real_music(self): - if is_librosa_available() and is_scipy_available() and is_essentia_available() and is_mindspore_available(): - from transformers import Pop2PianoFeatureExtractor, Pop2PianoTokenizer - - model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") - model.eval() - feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano") - tokenizer = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano") - ds = load_dataset("sweetcocoa/pop2piano_ci", split="test") - - output_fe = feature_extractor( - ds["audio"][0]["array"], sampling_rate=ds["audio"][0]["sampling_rate"], return_tensors="ms" - ) - output_model = model.generate(input_features=output_fe["input_features"], composer="composer1") - output_tokenizer = tokenizer.batch_decode(token_ids=output_model, feature_extractor_output=output_fe) - pretty_midi_object = output_tokenizer["pretty_midi_objects"][0] - - # Checking if no of notes are same - self.assertEqual(len(pretty_midi_object.instruments[0].notes), 59) - predicted_timings = [] - for i in pretty_midi_object.instruments[0].notes: - predicted_timings.append(i.start) - - # Checking note start timings(first 6) - EXPECTED_START_TIMINGS = [ - 0.4876190423965454, - 0.7314285635948181, - 0.9752380847930908, - 1.4396371841430664, - 1.6718367338180542, - 1.904036283493042, - ] - - np.allclose(EXPECTED_START_TIMINGS, predicted_timings[:6]) - - # Checking note end timings(last 6) - EXPECTED_END_TIMINGS = [ - 12.341403007507324, - 12.567797183990479, - 12.567797183990479, - 12.567797183990479, - 12.794191360473633, - 12.794191360473633, - ] - - np.allclose(EXPECTED_END_TIMINGS, predicted_timings[-6:]) \ No newline at end of file diff --git a/tests/transformers/models/prophetnet/__init__.py b/tests/transformers/models/prophetnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/prophetnet/test_modeling_prophetnet.py b/tests/transformers/models/prophetnet/test_modeling_prophetnet.py deleted file mode 100644 index 08f554461..000000000 --- a/tests/transformers/models/prophetnet/test_modeling_prophetnet.py +++ /dev/null @@ -1,1287 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import tempfile -import unittest - -from mindnlp.transformers import ProphetNetConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - ProphetNetDecoder, - ProphetNetEncoder, - ProphetNetForCausalLM, - ProphetNetForConditionalGeneration, - ProphetNetModel, - ProphetNetTokenizer, - ) - from mindnlp.transformers.modeling_outputs import BaseModelOutput - - -class ProphetNetModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - hidden_size=16, - encoder_seq_length=7, - decoder_seq_length=9, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=True, - decoder_start_token_id=0, - encoder_ffn_dim=32, - num_encoder_layers=2, - num_encoder_attention_heads=4, - decoder_ffn_dim=32, - num_decoder_layers=2, - num_decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - ngram=2, - num_buckets=32, - relative_max_distance=128, - disable_ngram_loss=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_decoder_layers - self.num_encoder_layers = num_encoder_layers - self.num_decoder_layers = num_decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_ffn_dim = encoder_ffn_dim - self.num_attention_heads = num_decoder_attention_heads - self.num_encoder_attention_heads = num_encoder_attention_heads - self.num_decoder_attention_heads = num_decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.ngram = ngram - self.num_buckets = num_buckets - self.relative_max_distance = relative_max_distance - self.disable_ngram_loss = disable_ngram_loss - self.max_position_embeddings = max_position_embeddings - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 7 - self.num_hidden_states_types = 3 # encoder, decoder_main, decoder_ngram - self.decoder_attention_idx = 2 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = 
ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = self.get_config() - - return ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def get_config(self): - return ProphetNetConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_encoder_layers=self.num_encoder_layers, - num_decoder_layers=self.num_decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_ffn_dim=self.encoder_ffn_dim, - num_encoder_attention_heads=self.num_encoder_attention_heads, - num_decoder_attention_heads=self.num_decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ngram=self.ngram, - num_buckets=self.num_buckets, - relative_max_distance=self.relative_max_distance, - disable_ngram_loss=self.disable_ngram_loss, - max_position_embeddings=self.max_position_embeddings, - is_encoder_decoder=self.is_encoder_decoder, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - return ( - config, - decoder_input_ids, - decoder_attention_mask, - encoder_hidden_states, - encoder_attention_mask, - lm_labels, - ) - - def check_prepare_lm_labels_via_shift_left( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetModel(config=config) - model.eval() - - # make sure that lm_labels are correctly padded from the right - lm_labels = lm_labels.masked_fill((lm_labels == self.decoder_start_token_id), self.eos_token_id) - - # add casaul pad token mask - triangular_mask = ops.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() - lm_labels = lm_labels.masked_fill(triangular_mask, self.pad_token_id) - decoder_input_ids = model._shift_right(lm_labels) - - for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): - # first item - self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) - if i < decoder_input_ids_slice.shape[-1]: - if i < decoder_input_ids.shape[-1] - 1: - # items before diagonal - self.parent.assertListEqual( - decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() - ) - # pad items after diagonal - if i < decoder_input_ids.shape[-1] - 2: - self.parent.assertListEqual( - decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() - ) - else: - # all items after square - self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) - - def create_and_check_model( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - decoder_output = result.last_hidden_state - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - 
self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_decoder_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) # cross-attention + uni-directional self-attention - - def create_and_check_with_lm_head( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 5) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_causal_lm_decoder( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetForCausalLM(config=config).eval() - outputs = model( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_generate_with_past_key_value_states( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetForConditionalGeneration(config=config).eval() - mindspore.manual_seed(0) - mindspore.set_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False - ) - mindspore.manual_seed(0) - mindspore.set_seed(0) - output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_decoder_generate_with_past_key_value_states( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetForCausalLM(config=config).eval() - mindspore.manual_seed(0) - mindspore.set_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], num_beams=2, max_length=10, do_sample=True, use_cache=False - ) - mindspore.manual_seed(0) - mindspore.set_seed(0) - output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=10, do_sample=True) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = ProphetNetModel(config=config).half().eval() - output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_encoder_decoder_shared_weights( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - 
decoder_attention_mask, - lm_labels, - ): - for model_class in [ProphetNetModel, ProphetNetForConditionalGeneration]: - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = model_class(config=config).eval() - # load state dict copies weights but does not tie them - - if model_class == ProphetNetForConditionalGeneration: - model.prophetnet.encoder.load_state_dict(model.prophetnet.decoder.state_dict(), strict=False) - else: - model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) - - mindspore.manual_seed(0) - mindspore.set_seed(0) - tied_config = copy.deepcopy(config) - tied_config.tie_encoder_decoder = True - tied_model = model_class(config=tied_config).eval() - - model_result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 - ) - ) - - # check that outputs after saving and loading are equal - with tempfile.TemporaryDirectory() as tmpdirname: - tied_model.save_pretrained(tmpdirname) - tied_model = model_class.from_pretrained(tmpdirname) - tied_model.eval() - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], - tied_model_result[0][0, :, random_slice_idx], - atol=1e-4, - ) - ) - - def check_fast_integration( - self, - config, - *args, - ): - input_ids = mindspore.tensor([[7, 4, 78, 0, 24, 52, 43]], dtype=mindspore.int64) - decoder_input_ids = mindspore.tensor([[12, 62, 25, 11, 47, 15, 14]], dtype=mindspore.int64) - attention_mask = mindspore.tensor([[1, 1, 1, 0, 1, 0, 0]], dtype=mindspore.int64) - decoder_attention_mask = mindspore.tensor([[1, 1, 1, 0, 0, 1, 0]], dtype=mindspore.int64) - lm_labels = mindspore.tensor([[62, 25, 11, 47, 15, 14, 24]], dtype=mindspore.int64) - mindspore.manual_seed(2) - mindspore.set_seed(2) - config.ngram = 4 - model = ProphetNetForConditionalGeneration(config=config) - model.eval() - with no_grad(): - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertTrue(ops.allclose(result.loss, mindspore.tensor(4.5892), atol=1e-3)) - - expected_logit_slice = mindspore.tensor( - [-0.0184, 0.0758, -0.0543, -0.0093, 0.0050, -0.0660, -0.1453] - ) - self.parent.assertTrue(ops.allclose(result.logits[0, :, 1], expected_logit_slice, atol=1e-3)) - - def check_model_with_attn_mask(self, config, input_ids, decoder_input_ids, *args): - 
model = ProphetNetModel(config=config) - model.eval() - - outputs_no_mask = model(input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5]) - attention_mask = ops.ones_like(input_ids) - decoder_attention_mask = ops.ones_like(decoder_input_ids) - - attention_mask[:, 5:] = 0 - - outputs_with_mask = model( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - # check encoder - self.parent.assertTrue( - ops.allclose( - outputs_no_mask.encoder_last_hidden_state[0, :, 0], - outputs_with_mask.encoder_last_hidden_state[0, :5, 0], - atol=1e-3, - ) - ) - - # check decoder - # main stream - self.parent.assertTrue( - ops.allclose( - outputs_no_mask.last_hidden_state[0, :, 0], outputs_with_mask.last_hidden_state[0, :5, 0], atol=1e-3 - ) - ) - # predict stream - self.parent.assertTrue( - ops.allclose( - outputs_no_mask.last_hidden_state_ngram[0, :5, 0], - outputs_with_mask.last_hidden_state_ngram[0, :5, 0], - atol=1e-2, - ) - ) - - def check_causal_lm_from_pretrained( - self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, *args - ): - model = ProphetNetForConditionalGeneration(config).eval() - - with tempfile.TemporaryDirectory() as tmp_dirname: - model.save_pretrained(tmp_dirname) - decoder = ProphetNetForCausalLM.from_pretrained(tmp_dirname) - - encoder_hidden_states = model.prophetnet.encoder(input_ids).last_hidden_state - - model_outputs = model( - encoder_outputs=BaseModelOutput(last_hidden_state=encoder_hidden_states), - decoder_input_ids=decoder_input_ids, - ) - dec_outputs = decoder(encoder_hidden_states=encoder_hidden_states, input_ids=decoder_input_ids) - - self.parent.assertTrue( - ops.allclose( - model_outputs.logits[0, :5], - dec_outputs.logits[0, :5], - atol=1e-3, - ) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "use_cache": False, - } - return config, inputs_dict - - -class ProphetNetStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - hidden_size=16, - encoder_seq_length=7, - decoder_seq_length=7, - # For common tests - is_training=True, - is_decoder=True, - use_attention_mask=True, - add_cross_attention=False, - use_cache=False, - use_labels=True, - decoder_start_token_id=0, - encoder_ffn_dim=32, - num_encoder_layers=2, - num_encoder_attention_heads=4, - decoder_ffn_dim=32, - num_decoder_layers=2, - num_decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - ngram=2, - num_buckets=32, - relative_max_distance=128, - disable_ngram_loss=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_decoder_layers - self.num_encoder_layers = num_encoder_layers - 
self.num_decoder_layers = num_decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_ffn_dim = encoder_ffn_dim - self.num_attention_heads = num_decoder_attention_heads - self.num_encoder_attention_heads = num_encoder_attention_heads - self.num_decoder_attention_heads = num_decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.ngram = ngram - self.num_buckets = num_buckets - self.relative_max_distance = relative_max_distance - self.use_cache = use_cache - self.disable_ngram_loss = disable_ngram_loss - self.max_position_embeddings = max_position_embeddings - self.add_cross_attention = add_cross_attention - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.num_hidden_states_types = 2 # decoder_main, decoder_ngram - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - config = ProphetNetConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_encoder_layers=self.num_encoder_layers, - num_decoder_layers=self.num_decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_ffn_dim=self.encoder_ffn_dim, - num_encoder_attention_heads=self.num_encoder_attention_heads, - num_decoder_attention_heads=self.num_decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ngram=self.ngram, - num_buckets=self.num_buckets, - relative_max_distance=self.relative_max_distance, - disable_ngram_loss=self.disable_ngram_loss, - max_position_embeddings=self.max_position_embeddings, - add_cross_attention=self.add_cross_attention, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - lm_labels, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.encoder_seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - return ( - config, - input_ids, - attention_mask, - encoder_hidden_states, - encoder_attention_mask, - lm_labels, - ) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = ProphetNetDecoder(config=config).eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - 
next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - model = ProphetNetDecoder(config=config).eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -class ProphetNetStandaloneEncoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - hidden_size=16, - encoder_seq_length=7, - decoder_seq_length=7, - # For common tests - is_training=True, - is_decoder=False, - use_attention_mask=True, - add_cross_attention=False, - use_cache=False, - use_labels=True, - decoder_start_token_id=0, - encoder_ffn_dim=32, - num_encoder_layers=2, - num_encoder_attention_heads=4, - decoder_ffn_dim=32, - num_decoder_layers=2, - num_decoder_attention_heads=4, - max_position_embeddings=30, - is_encoder_decoder=False, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - num_buckets=32, - relative_max_distance=128, - disable_ngram_loss=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = 
self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_decoder_layers - self.num_encoder_layers = num_encoder_layers - self.num_decoder_layers = num_decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.encoder_ffn_dim = encoder_ffn_dim - self.num_attention_heads = num_decoder_attention_heads - self.num_encoder_attention_heads = num_encoder_attention_heads - self.num_decoder_attention_heads = num_decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.num_buckets = num_buckets - self.relative_max_distance = relative_max_distance - self.use_cache = use_cache - self.disable_ngram_loss = disable_ngram_loss - self.max_position_embeddings = max_position_embeddings - self.add_cross_attention = add_cross_attention - self.is_encoder_decoder = is_encoder_decoder - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 1 - self.num_hidden_states_types = 1 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - config = ProphetNetConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_encoder_layers=self.num_encoder_layers, - num_decoder_layers=self.num_decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - encoder_ffn_dim=self.encoder_ffn_dim, - num_encoder_attention_heads=self.num_encoder_attention_heads, - num_decoder_attention_heads=self.num_decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - num_buckets=self.num_buckets, - relative_max_distance=self.relative_max_distance, - disable_ngram_loss=self.disable_ngram_loss, - max_position_embeddings=self.max_position_embeddings, - add_cross_attention=self.add_cross_attention, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (ProphetNetModel, ProphetNetForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (ProphetNetForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": ProphetNetModel, - "summarization": ProphetNetForConditionalGeneration, - "text-generation": ProphetNetForCausalLM, - "text2text-generation": ProphetNetForConditionalGeneration, - "translation": ProphetNetForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_resize_embeddings = False - is_encoder_decoder = True - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - 
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "TextGenerationPipelineTests": - # Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`. - # `ProphetNetConfig` was never used in pipeline tests: cannot create a simple - # tokenizer. - return True - - return False - - def setUp(self): - self.model_tester = ProphetNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=ProphetNetConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_lm_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_lm_head(*config_and_inputs) - - def test_only_decoder_causal_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_causal_lm_decoder(*config_and_inputs) - - def test_shared_weights(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - - def test_shift_labels_via_shift_left(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) - - @unittest.skip(reason="Flaky test with no simple resolution. TODO Fix me @patrickvonplaten") - def test_decoder_model_generate(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_generate_with_past_key_value_states(*config_and_inputs) - - def test_encoder_decoder_model_generate(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_generate_with_past_key_value_states(*config_and_inputs) - - def test_attn_mask_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_model_with_attn_mask(*config_and_inputs) - - def test_config_save(self): - config = self.model_tester.prepare_config_and_inputs()[0] - config.add_cross_attention = False - with tempfile.TemporaryDirectory() as tmp_dirname: - config.save_pretrained(tmp_dirname) - config = ProphetNetConfig.from_pretrained(tmp_dirname) - - self.assertFalse(config.add_cross_attention) - - def test_causal_lm_from_pretrained(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_causal_lm_from_pretrained(*config_and_inputs) - - def test_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - # methods overwrite method in `test_modeling_common.py` - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not 
None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - correct_outlen = 7 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - (self.model_tester.ngram + 1) * decoder_seq_length, - encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - @unittest.skip(reason="Generating with head_masking has not been implemented for ProphetNet models yet.") - def 
test_generate_with_head_masking(self): - pass - - -@require_mindspore -class ProphetNetStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (ProphetNetDecoder, ProphetNetForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (ProphetNetForCausalLM,) if is_mindspore_available() else () - test_pruning = False - - test_resize_embeddings = False - is_encoder_decoder = False - - def setUp(self): - self.model_tester = ProphetNetStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=ProphetNetConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - @unittest.skip(reason="Decoder cannot keep gradients") - def test_retain_grad_hidden_states_attentions(self): - return - - -@require_mindspore -class ProphetNetStandaloneEncoderModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ProphetNetEncoder,) if is_mindspore_available() else () - test_pruning = False - - test_resize_embeddings = False - is_encoder_decoder = False - - def setUp(self): - self.model_tester = ProphetNetStandaloneEncoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=ProphetNetConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - -@require_mindspore -class ProphetNetModelIntegrationTest(unittest.TestCase): - @slow - def test_pretrained_checkpoint_hidden_states(self): - model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased") - - # encoder-decoder outputs - encoder_ids = mindspore.tensor( - [ - [ - 2871, - 102, - 2048, - 3176, - 2780, - 1997, - 2871, - 26727, - 2169, - 2097, - 12673, - 1996, - 8457, - 2006, - 2049, - 8240, - 2859, - 2799, - 1012, - 2023, - 6512, - 2038, - 2174, - 13977, - 2195, - 25962, - 1012, - 102, - ] - ] - ) - - decoder_prev_ids = mindspore.tensor([[102, 2129, 2116, 2372, 2024, 2006, 2169, 1997, 2122, 2048, 2780, 1029]]) - output = model( - input_ids=encoder_ids, - attention_mask=None, - encoder_outputs=None, - decoder_input_ids=decoder_prev_ids, - ) - output_predited_logits = output[0] - expected_shape = (1, 12, 30522) - self.assertEqual(output_predited_logits.shape, expected_shape) - expected_slice = mindspore.tensor( - [[[-7.7729, -8.0343, -8.26001], [-7.74213, -7.8629, -8.6000], [-7.7328, -7.8269, -8.5264]]] - ) - # self.assertTrue(ops.allclose(output_predited_logits[:, :3, :3], expected_slice, atol=1e-4)) - assert ops.allclose(output_predited_logits[:, :3, :3], expected_slice, atol=1e-4) - - # encoder outputs - encoder_outputs = model.prophetnet.encoder(encoder_ids)[0] - expected_encoder_outputs_slice = mindspore.tensor( - [[[-0.2526, -0.1951, -0.2185], [-0.8923, 0.2992, -0.4623], [-0.4585, 0.0165, -0.6652]]] - ) - expected_shape_encoder = (1, 28, 1024) - self.assertEqual(encoder_outputs.shape, expected_shape_encoder) - # self.assertTrue(ops.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4)) - assert ops.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4) - - # decoder outputs - 
decoder_outputs = model.prophetnet.decoder(decoder_prev_ids, encoder_hidden_states=encoder_outputs) - predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1) - predicting_streams_logits = model.lm_head(predicting_streams) - next_first_stream_logits = predicting_streams_logits[:, 0] - # self.assertTrue(ops.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4)) - assert ops.allclose(next_first_stream_logits[:, :3, :3], expected_slice, atol=1e-4) - - @slow - def test_cnndm_inference(self): - model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-cnndm") - model.config.max_length = 512 - - tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm") - - ARTICLE_TO_SUMMARIZE = ( - "USTC was founded in Beijing by the Chinese Academy of Sciences (CAS) in September 1958. The Director of" - " CAS, Mr. Guo Moruo was appointed the first president of USTC. USTC's founding mission was to develop a" - " high-level science and technology workforce, as deemed critical for development of China's economy," - ' defense, and science and technology education. The establishment was hailed as "A Major Event in the' - ' History of Chinese Education and Science." CAS has supported USTC by combining most of its institutes' - " with the departments of the university. USTC is listed in the top 16 national key universities, becoming" - " the youngest national key university.".lower() - ) - input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=511, return_tensors="ms").input_ids - - summary_ids = model.generate( - input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True - ) - EXPECTED_SUMMARIZE_512 = ( - "us ##tc was founded by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc is listed in the" - " top 16 national key universities ." - ) - generated_titles = [ - " ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids - ] - self.assertListEqual( - [EXPECTED_SUMMARIZE_512], - generated_titles, - ) - input_ids = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=99, return_tensors="ms").input_ids - # actually 98 tokens are used. max_length=100 contains bos and eos. - summary_ids = model.generate( - input_ids, num_beams=4, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True - ) - EXPECTED_SUMMARIZE_100 = ( - r"us ##tc was founded in beijing by the chinese academy of sciences ( cas ) in 1958 . [X_SEP] us ##tc " - "'" - " s founding mission was to develop a high - level science and technology workforce . 
[X_SEP]" - ' establishment hailed as " a major event in the history of chinese education and science "' - ) - generated_titles = [ - " ".join(tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True)) for g in summary_ids - ] - self.assertListEqual( - [EXPECTED_SUMMARIZE_100], - generated_titles, - ) - - @slow - def test_question_gen_inference(self): - model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased-squad-qg") - - tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-squad-qg") - - INPUTS = [ - "Bill Gates [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.", - "1975 [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.", - "April 4, 1975 [SEP] Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975.", - ] - - input_ids = tokenizer(INPUTS, truncation=True, padding=True, return_tensors="ms").input_ids - - gen_output = model.generate(input_ids, num_beams=5, early_stopping=True) - generated_questions = tokenizer.batch_decode(gen_output, skip_special_tokens=True) - - EXPECTED_QUESTIONS = [ - "along with paul allen, who founded microsoft?", - "what year was microsoft founded?", - "when was microsoft founded?", - ] - - self.assertListEqual( - EXPECTED_QUESTIONS, - generated_questions, - ) \ No newline at end of file diff --git a/tests/transformers/models/prophetnet/test_tokenization_prophetnet.py b/tests/transformers/models/prophetnet/test_tokenization_prophetnet.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/qdqbert/__init__.py b/tests/transformers/models/qdqbert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/qdqbert/test_modeling_qdqbert.py b/tests/transformers/models/qdqbert/test_modeling_qdqbert.py deleted file mode 100644 index f481f50e0..000000000 --- a/tests/transformers/models/qdqbert/test_modeling_qdqbert.py +++ /dev/null @@ -1,720 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# Copyright 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch QDQBERT model.""" - -import unittest -import numpy as np -from mindnlp.transformers import QDQBertConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindspore import ops, nn - - from mindnlp.transformers import ( - QDQBertForMaskedLM, - QDQBertForMultipleChoice, - QDQBertForNextSentencePrediction, - QDQBertForQuestionAnswering, - QDQBertForSequenceClassification, - QDQBertForTokenClassification, - QDQBertLMHeadModel, - QDQBertModel, - ) - - -class QDQBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - # Set default quantizers before creating the model. - # import pytorch_quantization.nn as quant_nn - # from pytorch_quantization.tensor_quant import QuantDescriptor - - # The default tensor quantizer is set to use Max calibration method - # input_desc = QuantDescriptor(num_bits=8, calib_method="max") - # # The default tensor quantizer is set to be per-channel quantization for weights - # weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) - # quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) - # quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) - # # For the test cases, since QDQBert model is tested in one run without calibration, the quantized tensors are set as fake quantized tensors which give float type tensors in the end. 
- # quant_nn.TensorQuantizer.use_fb_fake_quant = True - - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return QDQBertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size] - ) - encoder_attention_mask = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2 - ) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = QDQBertModel(config=config) - model.set_train(False) - result = model( - input_ids, attention_mask=input_mask, token_type_ids=token_type_ids - ) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = QDQBertModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model( - input_ids, attention_mask=input_mask, token_type_ids=token_type_ids - ) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - 
token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = QDQBertLMHeadModel(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = QDQBertForMaskedLM(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_model_for_causal_lm_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = QDQBertLMHeadModel(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - encoder_hidden_states=encoder_hidden_states, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = QDQBertLMHeadModel(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue( - np.allclose( - 
output_from_past_slice.asnumpy(), - output_from_no_past_slice.asnumpy(), - atol=1e-3, - ) - ) - - def create_and_check_for_next_sequence_prediction( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = QDQBertForNextSentencePrediction(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = QDQBertForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = QDQBertForSequenceClassification(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = QDQBertForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.num_labels) - ) - - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_choices = self.num_choices - model = QDQBertForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = ( - input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - multiple_choice_token_type_ids = ( - token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - multiple_choice_input_mask = ( - input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class QDQBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - QDQBertModel, - QDQBertForMaskedLM, - 
QDQBertForMultipleChoice, - QDQBertForNextSentencePrediction, - QDQBertForQuestionAnswering, - QDQBertForSequenceClassification, - QDQBertForTokenClassification, - QDQBertLMHeadModel, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (QDQBertLMHeadModel,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": QDQBertModel, - "fill-mask": QDQBertForMaskedLM, - "question-answering": QDQBertForQuestionAnswering, - "text-classification": QDQBertForSequenceClassification, - "text-generation": QDQBertLMHeadModel, - "token-classification": QDQBertForTokenClassification, - "zero-shot": QDQBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = QDQBertModelTester(self) - self.config_tester = ConfigTester( - self, config_class=QDQBertConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_causal_lm_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_for_causal_lm_as_decoder( - *config_and_inputs - ) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs - ) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_next_sequence_prediction(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_next_sequence_prediction( - *config_and_inputs - ) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification( - *config_and_inputs - ) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google-bert/bert-base-uncased" - model = QDQBertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - # Override - def test_feed_forward_chunking(self): - # feed forward chunking is not supported in QDQBert - pass - - -@require_mindspore -class QDQBertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_no_head_absolute_embedding(self): - # Set default quantizers before creating the model. - # import pytorch_quantization.nn as quant_nn - # from pytorch_quantization.tensor_quant import QuantDescriptor - - # The default tensor quantizer is set to use Max calibration method - # input_desc = QuantDescriptor(num_bits=8, calib_method="max") - # # The default tensor quantizer is set to be per-channel quantization for weights - # weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) - # quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) - # quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) - - model = QDQBertModel.from_pretrained("google-bert/bert-base-uncased") - input_ids = mindspore.tensor( - [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]] - ) - attention_mask = mindspore.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = (1, 11, 768) - self.assertEqual(output.shape, expected_shape) - expected_slice = mindspore.tensor( - [ - [ - [0.4571, -0.0735, 0.8594], - [0.2774, -0.0278, 0.8794], - [0.3548, -0.0473, 0.7593], - ] - ] - ) - self.assertTrue( - np.allclose( - output[:, 1:4, 1:4].asnumpy(), expected_slice.asnumpy(), atol=1.75e-1 - ) - ) diff --git a/tests/transformers/models/qwen2/__init__.py b/tests/transformers/models/qwen2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/qwen2/test_modeling_qwen2.py b/tests/transformers/models/qwen2/test_modeling_qwen2.py deleted file mode 100644 index a4c12e8b2..000000000 --- a/tests/transformers/models/qwen2/test_modeling_qwen2.py +++ /dev/null @@ -1,485 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch Qwen2 model.""" - -import gc -import tempfile -import unittest - -import pytest - -from mindnlp.transformers import AutoTokenizer, Qwen2Config -from mindnlp.engine import set_seed -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad - - from mindnlp.transformers import ( - Qwen2ForCausalLM, - Qwen2ForSequenceClassification, - Qwen2ForTokenClassification, - Qwen2Model, - ) - - -class Qwen2ModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - max_window_layers=3, - use_sliding_window=True, - sliding_window=2, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - bos_token_id=1, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.max_window_layers = max_window_layers - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.scope = scope - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def 
get_config(self): - return Qwen2Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - max_window_layers=self.max_window_layers, - use_sliding_window=self.use_sliding_window, - sliding_window=self.sliding_window, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - bos_token_id=self.bos_token_id, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Qwen2 - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Qwen2Model(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Qwen2 - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Qwen2Model(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Qwen2 - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Qwen2ForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Qwen2 - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Qwen2ForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = 
ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Qwen2 -class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (Qwen2Model, Qwen2ForCausalLM, Qwen2ForSequenceClassification, Qwen2ForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (Qwen2ForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": Qwen2Model, - "text-classification": Qwen2ForSequenceClassification, - "token-classification": Qwen2ForTokenClassification, - "text-generation": Qwen2ForCausalLM, - "zero-shot": Qwen2ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - fx_compatible = True - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = Qwen2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_Qwen2_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - print(config) - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Qwen2ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Qwen2_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Qwen2ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Qwen2_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = Qwen2ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Qwen2,llama->Qwen2 - def test_Qwen2_token_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = Qwen2ForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=token_labels) - 
self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), - ) - - @unittest.skip(reason="Qwen2 buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Qwen2 uses GQA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - -@require_mindspore -class Qwen2IntegrationTest(unittest.TestCase): - @slow - def test_model_450m_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-450m-beta") - input_ids = mindspore.tensor([input_ids]) - with no_grad(): - out = model(input_ids).logits - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]]) - torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]) # fmt: skip - print(out[0, 0, :30]) - torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4) - - del model - gc.collect() - - @slow - def test_model_450m_generation(self): - EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. I’m not a big""" - prompt = "My favourite condiment is " - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-450m-beta", use_fast=False) - model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-450m-beta") - input_ids = tokenizer.encode(prompt, return_tensors="ms") - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - del model - gc.collect() - - @slow - @pytest.mark.flash_attn_test - def test_model_450m_long_prompt(self): - EXPECTED_OUTPUT_TOKEN_IDS = [306, 338] - # An input with 4097 tokens that is above the size of the sliding window - input_ids = [1] + [306, 338] * 2048 - model = Qwen2ForCausalLM.from_pretrained( - "Qwen/Qwen2-450m-beta", - # load_in_4bit=True, - # attn_implementation="flash_attention_2", - ) - input_ids = mindspore.tensor([input_ids]) - generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) - - # Assisted generation - assistant_model = model - assistant_model.generation_config.num_assistant_tokens = 2 - assistant_model.generation_config.num_assistant_tokens_schedule = "constant" - generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) - - del assistant_model - del model - gc.collect() - - @slow - def test_speculative_generation(self): - EXPECTED_TEXT_COMPLETION = ( - "My favourite condiment is 100% Sriracha. 
I love the heat, the tang and the fact costs" - ) - prompt = "My favourite condiment is " - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-beta", use_fast=False) - model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-450m-beta", ms_dtype=mindspore.float16) - assistant_model = Qwen2ForCausalLM.from_pretrained( - "Qwen/Qwen2-450m-beta", ms_dtype=mindspore.float16 - ) - input_ids = tokenizer.encode(prompt, return_tensors="ms") - - # greedy generation outputs - set_seed(0) - generated_ids = model.generate( - input_ids, max_new_tokens=20, do_sample=True, temperature=0.3, assistant_model=assistant_model - ) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - del model - gc.collect() \ No newline at end of file diff --git a/tests/transformers/models/qwen2_moe/__init__.py b/tests/transformers/models/qwen2_moe/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py deleted file mode 100644 index 96fff4e60..000000000 --- a/tests/transformers/models/qwen2_moe/test_modeling_qwen2_moe.py +++ /dev/null @@ -1,566 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch Qwen2MoE model.""" - -import gc -import tempfile -import unittest - -import pytest - -from mindnlp.transformers import AutoTokenizer, Qwen2MoeConfig -from mindnlp.engine import set_seed -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_mindspore_gpu, - slow, - is_mindspore_available, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - Qwen2MoeForCausalLM, - Qwen2MoeForSequenceClassification, - Qwen2MoeForTokenClassification, - Qwen2MoeModel, - ) - - -class Qwen2MoeModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - max_window_layers=3, - use_sliding_window=True, - sliding_window=2, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - expert_interval=1, - moe_intermediate_size=12, - shared_expert_intermediate_size=36, - shared_expert_gate=True, - num_experts_per_tok=2, - num_experts=8, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - bos_token_id=1, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.max_window_layers = max_window_layers - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.scope = scope - self.expert_interval = expert_interval - self.moe_intermediate_size = moe_intermediate_size - self.shared_expert_intermediate_size = shared_expert_intermediate_size - self.shared_expert_gate = shared_expert_gate - self.num_experts_per_tok = num_experts_per_tok - self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if 
self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return Qwen2MoeConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - max_window_layers=self.max_window_layers, - use_sliding_window=self.use_sliding_window, - sliding_window=self.sliding_window, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - expert_interval=self.expert_interval, - moe_intermediate_size=self.moe_intermediate_size, - shared_expert_intermediate_size=self.shared_expert_intermediate_size, - shared_expert_gate=self.shared_expert_gate, - num_experts_per_tok=self.num_experts_per_tok, - num_experts=self.num_experts, - norm_topk_prob=self.norm_topk_prob, - output_router_logits=self.output_router_logits, - router_aux_loss_coef=self.router_aux_loss_coef, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - bos_token_id=self.bos_token_id, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Qwen2Moe - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Qwen2MoeModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Qwen2Moe - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Qwen2MoeModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Qwen2Moe - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - 
choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Qwen2MoeForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Qwen2Moe - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Qwen2MoeForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Qwen2Moe -class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (Qwen2MoeModel, Qwen2MoeForCausalLM, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (Qwen2MoeForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": Qwen2MoeModel, - "text-classification": Qwen2MoeForSequenceClassification, - "token-classification": Qwen2MoeForTokenClassification, - "text-generation": 
Qwen2MoeForCausalLM, - "zero-shot": Qwen2MoeForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - fx_compatible = True - - # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = Qwen2MoeModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_Qwen2Moe_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - print(config) - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Qwen2MoeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Qwen2Moe_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Qwen2MoeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Qwen2Moe_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = Qwen2MoeForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Qwen2Moe,llama->Qwen2Moe - def test_Qwen2Moe_token_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) 
- token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = Qwen2MoeForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=token_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), - ) - - @unittest.skip(reason="Qwen2Moe buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Qwen2Moe uses GQA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - # Ignore copy - def test_load_balancing_loss(self): - r""" - Let's make sure we can actually compute the loss and do a backward on it. - """ - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.num_experts = 8 - config.expert_interval = 2 - config.output_router_logits = True - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - model = Qwen2MoeForCausalLM(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask) - self.assertEqual(result.router_logits[0].shape, (91, config.num_experts)) - assert ops.allclose(result.aux_loss, mindspore.tensor(2, dtype=mindspore.float32), rtol=1e-2, atol=1e-2) - - # First, we make sure that adding padding tokens doesn't change the loss - # loss(input_ids, attention_mask=None) == loss(input_ids + padding, attention_mask=attention_mask_with_padding) - pad_length = 1000 - # Add padding tokens (assume that pad_token_id=1) to input_ids - padding_block = ops.ones(input_ids.shape[0], pad_length, dtype=mindspore.int64) - padded_input_ids = ops.cat((padding_block, input_ids), dim=1) # this is to simulate padding to the left - padded_attention_mask = padded_input_ids.ne(1) - - padded_result = model(padded_input_ids, attention_mask=padded_attention_mask) - assert ops.allclose(result.aux_loss, padded_result.aux_loss, rtol=1e-4, atol=1e-4) - - # We make sure that the loss of includding padding tokens != the loss without padding tokens - # if attention_mask=None --> we don't exclude padding tokens - include_padding_result = model(padded_input_ids, attention_mask=None) - - # This is to mimic torch.testing.assert_not_close - self.assertNotAlmostEqual(include_padding_result.aux_loss.item(), result.aux_loss.item()) - - -@require_mindspore -class Qwen2MoeIntegrationTest(unittest.TestCase): - @slow - def test_model_a2_7b_logits(self): - input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", mirror='modelscope') - input_ids = mindspore.tensor([input_ids]) - with no_grad(): - out = model(input_ids).logits - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[-4.2125, -3.6416, -4.9136, -4.3005, -4.9938, -3.4393, -3.5195, -4.1621]]) - assert ops.allclose(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([2.3013, -0.6595, -0.1389, -1.4095, -1.7381, -1.7609, -2.0449, -2.4289, -3.0271, -2.1351, -0.6568, -4.6012, -1.9102, -0.7475, -3.1377, 4.6904, 7.1936, 7.0991, 6.4414, 6.1720, 6.2617, 5.8751, 5.6997, 5.6011, 5.5828, -3.9505, -0.5384, -0.3392, 1.2445, 2.0714]) # fmt: skip - print(out[0, 0, :30]) - assert ops.allclose(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4) - - del model - gc.collect() - - @slow - def 
test_model_a2_7b_generation(self): - EXPECTED_TEXT_COMPLETION = """To be or not to be, that is the question. This is the question that has been asked by many people over the""" - prompt = "To be or not to" - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False, mirror='modelscope') - model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", mirror='modelscope') - input_ids = tokenizer.encode(prompt, return_tensors="ms") - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - del model - gc.collect() - - # @require_bitsandbytes - # @slow - # @require_flash_attn - # @pytest.mark.flash_attn_test - # def test_model_a2_7b_long_prompt(self): - # EXPECTED_OUTPUT_TOKEN_IDS = [306, 338] - # # An input with 4097 tokens that is above the size of the sliding window - # input_ids = [1] + [306, 338] * 2048 - # model = Qwen2MoeForCausalLM.from_pretrained( - # "Qwen/Qwen1.5-MoE-A2.7B", - # device_map="auto", - # load_in_4bit=True, - # attn_implementation="flash_attention_2", - # ) - # input_ids = mindspore.tensor([input_ids]).to(model.model.embed_tokens.weight.device) - # generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - # self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) - - # # Assisted generation - # assistant_model = model - # assistant_model.generation_config.num_assistant_tokens = 2 - # assistant_model.generation_config.num_assistant_tokens_schedule = "constant" - # generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - # self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) - - # del assistant_model - # del model - # gc.collect() - - - @slow - def test_speculative_generation(self): - EXPECTED_TEXT_COMPLETION = ( - "To be or not to be, that is the question.\nThe answer is to be, of course. But what does it" - ) - prompt = "To be or not to" - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False, mirror='modelscope') - model = Qwen2MoeForCausalLM.from_pretrained( - "Qwen/Qwen1.5-MoE-A2.7B", ms_dtype=mindspore.float16, mirror='modelscope' - ) - assistant_model = Qwen2MoeForCausalLM.from_pretrained( - "Qwen/Qwen1.5-MoE-A2.7B", ms_dtype=mindspore.float16, mirror='modelscope' - ) - input_ids = tokenizer.encode(prompt, return_tensors="ms") - - # greedy generation outputs - set_seed(0) - generated_ids = model.generate( - input_ids, max_new_tokens=20, do_sample=True, temperature=0.3, assistant_model=assistant_model - ) - text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) - - del model - gc.collect() - - @slow - def test_model_a2_7b_generation_time(self): - EXPECTED_TEXT_COMPLETION = """To be or not to be, that is the question. 
This is the question that has been asked by many people over the""" - prompt = "To be or not to" - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", use_fast=False, mirror='modelscope') - model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", mirror='modelscope') - input_ids = tokenizer.encode(prompt, return_tensors="ms") - - # greedy generation outputs - generated_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False) diff --git a/tests/transformers/models/qwen2_vl/__init__.py b/tests/transformers/models/qwen2_vl/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/transformers/models/qwen2_vl/test_modeling_qwen2_vl.py deleted file mode 100644 index b1f89705d..000000000 --- a/tests/transformers/models/qwen2_vl/test_modeling_qwen2_vl.py +++ /dev/null @@ -1,432 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Qwen2-VL model.""" - -import gc -import unittest - -import requests - -from mindnlp.transformers import ( - AutoProcessor, - Qwen2VLConfig, - Qwen2VLForConditionalGeneration -) -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) -from mindnlp.utils.testing_utils import ( - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - -if is_vision_available(): - from PIL import Image - - -class Qwen2VLVisionText2TextModelTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=7, - num_channels=3, - ignore_index=-100, - image_size=14, - bos_token_id=0, - eos_token_id=1, - pad_token_id=2, - vision_start_token_id=151652, - image_token_id=151655, - video_token_id=151656, - hidden_act="silu", - hidden_size=32, - vocab_size=152064, - intermediate_size=37, - max_position_embeddings=512, - max_window_layers=3, - model_type="qwen2_vl", - num_attention_heads=4, - num_hidden_layers=4, - num_key_value_heads=2, - rope_theta=10000, - tie_word_embeddings=True, - is_training=True, - vision_config={ - "depth": 2, - "embed_dim": 32, - "hidden_act": "quick_gelu", - "hidden_size": 32, - "mlp_ratio": 4, - "num_heads": 4, - "patch_size": 14, - "spatial_merge_size": 1, - "temporal_patch_size": 2, - }, - rope_scaling={"type": "mrope", "mrope_section": [2, 1, 1]}, - ): - self.parent = parent - self.ignore_index = ignore_index - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.vision_start_token_id = vision_start_token_id - self.image_token_id = image_token_id - self.video_token_id = video_token_id - self.hidden_act = hidden_act - self.hidden_size = hidden_size - 
self.intermediate_size = intermediate_size - self.max_position_embeddings = max_position_embeddings - self.max_window_layers = max_window_layers - self.model_type = model_type - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = num_hidden_layers - self.num_key_value_heads = num_key_value_heads - self.rope_theta = rope_theta - self.tie_word_embeddings = tie_word_embeddings - self.vision_config = vision_config - self.rope_scaling = rope_scaling - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.is_training = is_training - self.vocab_size = vocab_size - self.num_image_tokens = 32 - self.seq_length = seq_length + self.num_image_tokens - - def get_config(self): - return Qwen2VLConfig( - hidden_size=self.hidden_size, - intermediate_size=self.intermediate_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - hidden_act=self.hidden_act, - max_position_embeddings=self.max_position_embeddings, - vision_config=self.vision_config, - model_type=self.model_type, - max_window_layers=self.max_window_layers, - rope_scaling=self.rope_scaling, - tie_word_embeddings=self.tie_word_embeddings, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - vision_start_token_id=self.vision_start_token_id, - image_token_id=self.image_token_id, - video_token_id=self.video_token_id, - vocab_size=self.vocab_size, - ) - - def prepare_config_and_inputs(self): - config = self.get_config() - patch_size = config.vision_config.patch_size - temporal_patch_size = config.vision_config.temporal_patch_size - pixel_values = floats_tensor( - [ - self.batch_size * (self.image_size**2) // (patch_size**2), - self.num_channels * (patch_size**2) * temporal_patch_size, - ] - ) - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - input_ids[input_ids == self.image_token_id] = self.pad_token_id - input_ids[:, self.num_image_tokens] = self.image_token_id - labels = ops.zeros( - (self.batch_size, self.seq_length), - dtype=mindspore.int64, - ) - inputs_dict = { - "pixel_values": pixel_values, - "image_grid_thw": mindspore.tensor([[1, 1, 1]] * self.batch_size), - "input_ids": input_ids, - "attention_mask": attention_mask, - "labels": labels, - } - return config, inputs_dict - - def create_and_check_qwen2_vl_model_fp16_forward( - self, config, input_ids, pixel_values, attention_mask, image_grid_thw - ): - model = Qwen2VLForConditionalGeneration(config=config) - model.half() - model.eval() - logits = model( - input_ids=input_ids, - attention_mask=attention_mask, - image_grid_thw=image_grid_thw, - pixel_values=pixel_values.to(mindspore.bfloat16), - return_dict=True, - )["logits"] - self.parent.assertFalse(ops.isnan(logits).any().item()) - - def create_and_check_qwen2_vl_model_fp16_autocast_forward( - self, config, input_ids, pixel_values, attention_mask, image_grid_thw - ): - config.ms_dtype = mindspore.float16 - model = Qwen2VLForConditionalGeneration(config=config) - model.eval() - # with torch.autocast(device_type="cuda", dtype=mindspore.float16): - logits = model( - input_ids=input_ids, - attention_mask=attention_mask, - image_grid_thw=image_grid_thw, - 
pixel_values=pixel_values.to(mindspore.bfloat16), - return_dict=True, - )["logits"] - self.parent.assertFalse(ops.isnan(logits).any().item()) - - -@require_mindspore -class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - """ - Model tester for `Qwen2VLForConditionalGeneration`. - """ - - all_model_classes = (Qwen2VLForConditionalGeneration,) if is_mindspore_available() else () - all_generative_model_classes = (Qwen2VLForConditionalGeneration,) if is_mindspore_available() else () - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = Qwen2VLVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="Feedforward chunking is not yet supported") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - - @unittest.skip(reason="CPU offload is not yet supported") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") - def test_disk_offload_bin(self): - pass - - @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") - def test_disk_offload_safetensors(self): - pass - - @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. 
Skip for now.") - def test_model_parallelism(self): - pass - - @unittest.skip(reason="Compile not yet supported because in Qwen2VL models") - def test_sdpa_can_compile_dynamic(self): - pass - - @unittest.skip(reason="Compile not yet supported because in Qwen2VL models") - def test_sdpa_can_dispatch_on_flash(self): - pass - - @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip(reason="We cannot configure to output a smaller model.") - def test_model_is_small(self): - pass - - @unittest.skip( - reason="Qwen2-VL can't do low-memory generation because position IDs have extra dimension and split function doesn't work for that" - ) - def test_beam_search_low_memory(self): - pass - - @unittest.skip( - reason="VLMs can't generate from inputs embeds and pixels. This can be tested as part of bacbone LM, no need to run the tes for VLMs" - ) - def test_generate_from_inputs_embeds_with_static_cache(self): - pass - - -@require_mindspore -class Qwen2VLIntegrationTest(unittest.TestCase): - def setUp(self): - self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") - self.messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What kind of dog is this?"}, - ], - } - ] - url = "https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen2-VL/demo_small.jpg" - self.image = Image.open(requests.get(url, stream=True).raw) - - def tearDown(self): - gc.collect() - # torch.cuda.empty_cache() - - @slow - def test_small_model_integration_test(self): - model = Qwen2VLForConditionalGeneration.from_pretrained( - "Qwen/Qwen2-VL-7B-Instruct", ms_dtype="auto", device_map="auto" - ) - - text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) - inputs = self.processor(text=[text], images=[self.image], return_tensors="ms") - - expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655] # fmt: skip - assert expected_input_ids == inputs.input_ids[0].tolist()[:17] - - expected_pixel_slice = mindspore.tensor( - [ - [0.8792, 0.8792, 0.9084], - [1.1858, 1.1858, 1.2296], - [1.2004, 1.2004, 1.2150], - [1.4340, 1.4340, 1.4194], - [1.3902, 1.4048, 1.4194], - [1.5216, 1.5362, 1.5362], - ], - dtype=mindspore.float32, - ) - assert ops.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3) - - - output = model.generate(**inputs, max_new_tokens=30) - EXPECTED_DECODED_TEXT = "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. 
Labradors are known for their friendly and intelligent nature, making them popular pets" - - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - def test_small_model_integration_test_batch(self): - model = Qwen2VLForConditionalGeneration.from_pretrained( - "Qwen/Qwen2-VL-7B-Instruct", ms_dtype="auto", device_map="auto" - ) - text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) - inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="ms") - - # it should not matter whether two images are the same size or not - output = model.generate(**inputs, max_new_tokens=30) - - EXPECTED_DECODED_TEXT = [ - 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular choices', - 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets' - ] # fmt: skip - self.assertEqual( - self.processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - def test_small_model_integration_test_batch_wo_image(self): - model = Qwen2VLForConditionalGeneration.from_pretrained( - "Qwen/Qwen2-VL-7B-Instruct", ms_dtype="auto", device_map="auto" - ) - text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) - messages2 = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Who are you?"}, - ] - text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True) - inputs = self.processor(text=[text, text2], images=[self.image], padding=True, return_tensors="ms") - - # it should not matter whether two images are the same size or not - output = model.generate(**inputs, max_new_tokens=30) - - EXPECTED_DECODED_TEXT = [ - 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets', - 'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. 
I am designed to assist with various tasks and answer questions to the best of my' - ] # fmt: skip - self.assertEqual( - self.processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - def test_small_model_integration_test_batch_different_resolutions(self): - model = Qwen2VLForConditionalGeneration.from_pretrained( - "Qwen/Qwen2-VL-7B-Instruct", ms_dtype="auto", device_map="auto" - ) - text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) - text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) - image2 = self.image.resize((224, 224)) - inputs = self.processor(text=[text, text2], images=[self.image, image2], padding=True, return_tensors="ms") - - # it should not matter whether two images are the same size or not - output = model.generate(**inputs, max_new_tokens=30) - - EXPECTED_DECODED_TEXT = [ - "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets", - "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and intelligent nature, making them popular pets", - ] - self.assertEqual( - self.processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) diff --git a/tests/transformers/models/rag/__init__.py b/tests/transformers/models/rag/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/rag/test_modeling_rag.py b/tests/transformers/models/rag/test_modeling_rag.py deleted file mode 100644 index 9beb343a0..000000000 --- a/tests/transformers/models/rag/test_modeling_rag.py +++ /dev/null @@ -1,1191 +0,0 @@ -# coding=utf-8 -# Copyright 2020, The RAG Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import gc -import json -import os -import shutil -import tempfile -import unittest -from unittest.mock import patch - -import numpy as np - - -from mindnlp.transformers import BartTokenizer, T5Tokenizer, AutoModelForSeq2SeqLM, AutoModel -from mindnlp.transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES -from mindnlp.transformers.models.dpr.tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer -from mindnlp.transformers.models.rag.retrieval_rag import is_datasets_available, is_faiss_available -from mindnlp.transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES -from mindnlp.utils import cached_property, is_mindspore_available -from mindnlp.utils.testing_utils import ( - get_tests_dir, - require_sentencepiece, - require_tokenizers, - require_mindspore, - require_mindspore_gpu, - slow, -) -from ..bart.test_modeling_bart import BartModelTester -from ..dpr.test_modeling_dpr import DPRModelTester -from ..t5.test_modeling_t5 import T5ModelTester - -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -TOLERANCE = 1e-3 - -T5_SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_mindspore_available() and is_datasets_available() and is_faiss_available(): - import faiss - import mindspore - from mindnlp.core import ops - from datasets import Dataset - - from mindnlp.transformers import ( - AutoConfig, - DPRContextEncoder, - RagConfig, - RagModel, - RagRetriever, - RagSequenceForGeneration, - RagTokenForGeneration, - RagTokenizer, - ) - from mindnlp.transformers.modeling_outputs import BaseModelOutput - -def _assert_tensors_equal(a_, b_, atol=1e-12, prefix=""): - """If tensors not close, or a and b arent both tensors, raise a nice Assertion error.""" - if a_ is None and b_ is None: - return True - try: - a = a_.asnumpy() - b = b_.asnumpy() - if np.allclose(a, b, atol=atol): - return True - raise - except Exception: - msg = f"{a} != {b}" - if prefix: - msg = prefix + ": " + msg - raise AssertionError(msg) - - -def require_retrieval(test_case): - """ - Decorator marking a test that requires a set of dependencies necessary for pefrorm retrieval with - [`RagRetriever`]. - - These tests are skipped when respective libraries are not installed. 
- - """ - if not (is_mindspore_available() and is_datasets_available() and is_faiss_available()): - test_case = unittest.skip(reason="test requires PyTorch, datasets and faiss")(test_case) - return test_case - - -@require_mindspore -@require_retrieval -@require_sentencepiece -class RagTestMixin: - all_model_classes = ( - (RagModel, RagTokenForGeneration, RagSequenceForGeneration) - if is_mindspore_available() and is_datasets_available() and is_faiss_available() - else () - ) - - retrieval_vector_size = 32 - n_docs = 3 - max_combined_length = 16 - - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - # DPR tok - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") - os.makedirs(dpr_tokenizer_path, exist_ok=True) - self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - # BART tok - vocab = [ - "l", - "o", - "w", - "e", - "r", - "s", - "t", - "i", - "d", - "n", - "\u0120", - "\u0120l", - "\u0120n", - "\u0120lo", - "\u0120low", - "er", - "\u0120lowest", - "\u0120newer", - "\u0120wider", - "", - ] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") - os.makedirs(bart_tokenizer_path, exist_ok=True) - self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - t5_tokenizer = T5Tokenizer(T5_SAMPLE_VOCAB) - t5_tokenizer_path = os.path.join(self.tmpdirname, "t5_tokenizer") - t5_tokenizer.save_pretrained(t5_tokenizer_path) - - @cached_property - def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: - return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) - - @cached_property - def dpr_ctx_encoder_tokenizer(self) -> DPRContextEncoderTokenizer: - return DPRContextEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) - - @cached_property - def bart_tokenizer(self) -> BartTokenizer: - return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer")) - - @cached_property - def t5_tokenizer(self) -> BartTokenizer: - return T5Tokenizer.from_pretrained(os.path.join(self.tmpdirname, "t5_tokenizer")) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - def get_retriever(self, config): - dataset = Dataset.from_dict( - { - "id": ["0", "1", "3"], - "text": ["foo", "bar", "qux"], - "title": ["Foo", "Bar", "Qux"], - "embeddings": [ - np.ones(self.retrieval_vector_size), - 2 * np.ones(self.retrieval_vector_size), - 3 * np.ones(self.retrieval_vector_size), - ], - } - ) - dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT) - tokenizer = self.bart_tokenizer if config.generator.model_type == "bart" else self.t5_tokenizer 
- with patch("mindnlp.transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: - mock_load_dataset.return_value = dataset - retriever = RagRetriever( - config, - question_encoder_tokenizer=self.dpr_tokenizer, - generator_tokenizer=tokenizer, - ) - return retriever - - def check_model_with_retriever( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - for model_class in self.all_model_classes: - model = model_class(config, retriever=self.get_retriever(config)) - model.eval() - - self.assertTrue(model.config.is_encoder_decoder) - - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - # logits - self.assertEqual( - outputs.logits.shape, - (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), - ) - # generator encoder last hidden states - self.assertEqual( - outputs.generator_enc_last_hidden_state.shape, - (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), - ) - # doc scores - self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) - - def check_model_with_end2end_retriever( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - context_encoder_tokenizer = self.dpr_ctx_encoder_tokenizer - dpr_context_encoder = DPRContextEncoder(config.question_encoder) # dpr is a twin tower - - retriever = self.get_retriever(config) - retriever.set_ctx_encoder_tokenizer(context_encoder_tokenizer) # setting the ctx_encoder_tokenizer. 
- - for model_class in [RagTokenForGeneration, RagSequenceForGeneration]: - model = model_class(config, retriever=retriever) - model.set_context_encoder_for_training(dpr_context_encoder) # set the context_encoder for training - model.eval() - - self.assertTrue(model.config.is_encoder_decoder) - - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - # logits - self.assertEqual( - outputs.logits.shape, - (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), - ) - # generator encoder last hidden states - self.assertEqual( - outputs.generator_enc_last_hidden_state.shape, - (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), - ) - # doc scores - self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) - - def check_model_generate_from_context_input_ids( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - retriever = self.get_retriever(config) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - self.assertTrue(model.config.is_encoder_decoder) - - question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] - - out = retriever( - input_ids, - question_hidden_states.to(mindspore.float32).numpy(), - prefix=config.generator.prefix, - return_tensors="ms", - ) - - context_input_ids, context_attention_mask, retrieved_doc_embeds = ( - out["context_input_ids"], - out["context_attention_mask"], - out["retrieved_doc_embeds"], - ) - - # cast - retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states.dtype) - context_input_ids = context_input_ids.to(input_ids.dtype) - context_attention_mask = context_attention_mask.to(input_ids.dtype) - - # compute doc_scores - doc_scores = ops.bmm(question_hidden_states.unsqueeze(1), ops.transpose(retrieved_doc_embeds,1, 2)).squeeze( - 1 - ) - - outputs = model.generate( - context_input_ids=context_input_ids, - context_attention_mask=context_attention_mask, - doc_scores=doc_scores, - do_deduplication=True, - ) - - self.assertIsNotNone(outputs) - - def check_model_generate( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - for model_class in self.all_model_classes[1:]: - model = model_class(config, retriever=self.get_retriever(config)) - model.eval() - - self.assertTrue(model.config.is_encoder_decoder) - - outputs = model.generate( - input_ids=input_ids, - num_beams=2, - num_return_sequences=2, - decoder_start_token_id=config.generator.eos_token_id, - ) - - self.assertIsNotNone(outputs) - - def check_model_without_retriever( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - retriever = self.get_retriever(config) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - self.assertTrue(model.config.is_encoder_decoder) - - question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] - - out = retriever( - input_ids, - question_hidden_states.to(mindspore.float32).numpy(), - 
prefix=config.generator.prefix, - return_tensors="ms", - ) - - context_input_ids, context_attention_mask, retrieved_doc_embeds = ( - out["context_input_ids"], - out["context_attention_mask"], - out["retrieved_doc_embeds"], - ) - - # cast - retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states.dtype) - context_input_ids = context_input_ids.to(input_ids.dtype) - context_attention_mask = context_attention_mask.to(input_ids.dtype) - - # compute doc_scores - doc_scores = ops.bmm(question_hidden_states.unsqueeze(1), ops.transpose(retrieved_doc_embeds,1, 2)).squeeze( - 1 - ) - - outputs = model( - context_input_ids=context_input_ids, - context_attention_mask=context_attention_mask, - doc_scores=doc_scores, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - # logits - self.assertEqual( - outputs.logits.shape, - (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), - ) - # generator encoder last hidden states - self.assertEqual( - outputs.generator_enc_last_hidden_state.shape, - (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), - ) - # doc scores - self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) - - def check_model_custom_n_docs( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, n_docs, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - retriever = self.get_retriever(config) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - self.assertTrue(model.config.is_encoder_decoder) - - question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] - - out = retriever( - input_ids, - question_hidden_states.to(mindspore.float32).numpy(), - prefix=config.generator.prefix, - return_tensors="ms", - n_docs=n_docs, - ) - - context_input_ids, context_attention_mask, retrieved_doc_embeds = ( - out["context_input_ids"], - out["context_attention_mask"], - out["retrieved_doc_embeds"], - ) - - # cast - retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states.dtype) - context_input_ids = context_input_ids.to(input_ids.dtype) - context_attention_mask = context_attention_mask.to(input_ids.dtype) - - # compute doc_scores - doc_scores = ops.bmm(question_hidden_states.unsqueeze(1), ops.transpose(retrieved_doc_embeds,1, 2)).squeeze( - 1 - ) - - outputs = model( - context_input_ids=context_input_ids, - context_attention_mask=context_attention_mask, - doc_scores=doc_scores, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - n_docs=n_docs, - ) - - # logits - self.assertEqual( - outputs.logits.shape, - (n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), - ) - # generator encoder last hidden states - self.assertEqual( - outputs.generator_enc_last_hidden_state.shape, - (n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), - ) - # doc scores - self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], n_docs)) - - def check_model_with_mismatch_n_docs_value( - self, - config, - input_ids, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - retriever_n_docs, - generator_n_docs, - **kwargs, - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - retriever = self.get_retriever(config) - - for 
model_class in self.all_model_classes: - model = model_class(config) - model.eval() - self.assertTrue(model.config.is_encoder_decoder) - - question_hidden_states = model.question_encoder(input_ids, attention_mask=attention_mask)[0] - - out = retriever( - input_ids, - question_hidden_states.to(mindspore.float32).numpy(), - prefix=config.generator.prefix, - return_tensors="ms", - n_docs=retriever_n_docs, - ) - - context_input_ids, context_attention_mask, retrieved_doc_embeds = ( - out["context_input_ids"], - out["context_attention_mask"], - out["retrieved_doc_embeds"], - ) - - # cast - retrieved_doc_embeds = retrieved_doc_embeds.to(question_hidden_states.dtype) - context_input_ids = context_input_ids.to(input_ids.dtype) - context_attention_mask = context_attention_mask.to(input_ids.dtype) - - # compute doc_scores - doc_scores = ops.bmm(question_hidden_states.unsqueeze(1), ops.transpose(retrieved_doc_embeds,1, 2)).squeeze( - 1 - ) - - self.assertRaises( - AssertionError, - model.__call__, - context_input_ids=context_input_ids, - context_attention_mask=context_attention_mask, - doc_scores=doc_scores, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - n_docs=generator_n_docs, - ) - - def check_model_with_encoder_outputs( - self, config, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, **kwargs - ): - self.assertIsNotNone(config.question_encoder) - self.assertIsNotNone(config.generator) - - for model_class in self.all_model_classes: - model = model_class(config, retriever=self.get_retriever(config)) - model.eval() - - self.assertTrue(model.config.is_encoder_decoder) - - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - encoder_outputs = BaseModelOutput(outputs.generator_enc_last_hidden_state) - - # run only generator - outputs = model( - encoder_outputs=encoder_outputs, - doc_scores=outputs.doc_scores, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - # logits - self.assertEqual( - outputs.logits.shape, - (self.n_docs * decoder_input_ids.shape[0], decoder_input_ids.shape[1], config.generator.vocab_size), - ) - # generator encoder last hidden states - self.assertEqual( - outputs.generator_enc_last_hidden_state.shape, - (self.n_docs * decoder_input_ids.shape[0], self.max_combined_length, config.generator.hidden_size), - ) - # doc scores - self.assertEqual(outputs.doc_scores.shape, (input_ids.shape[0], self.n_docs)) - - def test_model_with_retriever(self): - inputs_dict = self.config_and_inputs - self.check_model_with_retriever(**inputs_dict) - - def test_model_with_end2end_retriever(self): - inputs_dict = self.config_and_inputs - self.check_model_with_end2end_retriever(**inputs_dict) - - def test_model_without_retriever(self): - inputs_dict = self.config_and_inputs - self.check_model_without_retriever(**inputs_dict) - - def test_model_with_encoder_outputs(self): - inputs_dict = self.config_and_inputs - self.check_model_with_encoder_outputs(**inputs_dict) - - def test_model_generate(self): - - inputs_dict = self.config_and_inputs - self.check_model_generate(**inputs_dict) - - def test_model_with_custom_n_docs(self): - inputs_dict = self.config_and_inputs - inputs_dict["n_docs"] = 1 - self.check_model_custom_n_docs(**inputs_dict) - - def test_model_with_mismatch_n_docs_value(self): - inputs_dict = self.config_and_inputs - inputs_dict["retriever_n_docs"] = 3 - 
inputs_dict["generator_n_docs"] = 2 - self.check_model_with_mismatch_n_docs_value(**inputs_dict) - - -@require_mindspore -@require_retrieval -class RagDPRBartTest(RagTestMixin, unittest.TestCase): - @cached_property - def config_and_inputs(self): - question_encoder_tester = DPRModelTester(self) - dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs() - generator_tester = BartModelTester(self) - bart_config_and_inputs = generator_tester.prepare_config_and_inputs_for_common() - - (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs - (generator_config, bart_inputs_dict) = bart_config_and_inputs - decoder_input_ids, decoder_attention_mask = bart_inputs_dict["input_ids"], bart_inputs_dict["attention_mask"] - - config = RagConfig.from_question_encoder_generator_configs( - question_encoder_config, - generator_config, - n_docs=self.n_docs, - retrieval_vector_size=self.retrieval_vector_size, - max_combined_length=self.max_combined_length, - ) - - return { - "config": config, - "input_ids": input_ids, - "attention_mask": input_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - } - - -@require_mindspore -@require_retrieval -class RagDPRT5Test(RagTestMixin, unittest.TestCase): - @cached_property - def config_and_inputs(self): - question_encoder_tester = DPRModelTester(self) - dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs() - generator_tester = T5ModelTester(self, vocab_size=1100) - t5_config_and_inputs = generator_tester.prepare_config_and_inputs() - - (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs - (generator_config, _, decoder_input_ids, _, decoder_attention_mask, _) = t5_config_and_inputs - config = RagConfig.from_question_encoder_generator_configs( - question_encoder_config, - generator_config, - n_docs=self.n_docs, - retrieval_vector_size=self.retrieval_vector_size, - max_combined_length=self.max_combined_length, - ) - - return { - "config": config, - "input_ids": input_ids, - "attention_mask": input_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - } - - -@require_mindspore -@require_retrieval -@require_sentencepiece -@require_tokenizers -# @require_mindspore_gpu -class RagModelIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - @cached_property - def sequence_model(self): - return ( - RagSequenceForGeneration.from_pretrained_question_encoder_generator( - "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn" - ).eval() - ) - - @cached_property - def token_model(self): - return ( - RagTokenForGeneration.from_pretrained_question_encoder_generator( - "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn" - ).eval() - ) - - def get_rag_config(self): - question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base") - generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn") - return RagConfig.from_question_encoder_generator_configs( - question_encoder_config, - generator_config, - bos_token_id=0, - decoder_start_token_id=2, - eos_token_id=2, - is_encoder_decoder=True, - pad_token_id=1, - vocab_size=50264, - title_sep=" / ", - doc_sep=" // ", - n_docs=5, - max_combined_length=300, - dataset="wiki_dpr", - dataset_split="train", - index_name="exact", - index_path=None, - 
use_dummy_dataset=True, - retrieval_vector_size=768, - retrieval_batch_size=8, - dataset_revision="b24a417", - ) - - @slow - def test_rag_sequence_inference(self): - - rag_config = self.get_rag_config() - rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( - "facebook/dpr-question_encoder-single-nq-base" - ) - rag_retriever = RagRetriever( - rag_config, - question_encoder_tokenizer=rag_question_encoder_tokenizer, - generator_tokenizer=rag_decoder_tokenizer, - ) - - rag_sequence = self.sequence_model - rag_sequence.set_retriever(rag_retriever) - - input_ids = rag_question_encoder_tokenizer( - "who sings does he love me with reba", return_tensors="ms" - ).input_ids - decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="ms").input_ids - - with mindspore._no_grad(): - output = rag_sequence( - input_ids, - labels=decoder_input_ids, - ) - - expected_shape = (5, 5, 50264) - self.assertEqual(output.logits.shape, expected_shape) - - expected_doc_scores = mindspore.tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]) - _assert_tensors_equal(expected_doc_scores, output.doc_scores, atol=TOLERANCE) - - # expected_loss = mindspore.tensor([36.7368]) - # _assert_tensors_equal(expected_loss, output.loss, atol=TOLERANCE) - - @slow - def test_rag_token_inference(self): - - rag_config = self.get_rag_config() - rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( - "facebook/dpr-question_encoder-single-nq-base" - ) - rag_retriever = RagRetriever( - rag_config, - question_encoder_tokenizer=rag_question_encoder_tokenizer, - generator_tokenizer=rag_decoder_tokenizer, - ) - - rag_token = self.token_model - rag_token.set_retriever(rag_retriever) - - input_ids = rag_question_encoder_tokenizer( - "who sings does he love me with reba", return_tensors="ms" - ).input_ids - decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="ms").input_ids - - input_ids = input_ids - decoder_input_ids = decoder_input_ids - - with mindspore._no_grad(): - output = rag_token( - input_ids, - labels=decoder_input_ids, - ) - - expected_shape = (5, 5, 50264) - self.assertEqual(output.logits.shape, expected_shape) - - expected_doc_scores = mindspore.tensor([[75.0286, 74.4998, 74.0804, 74.0306, 73.9504]]) - _assert_tensors_equal(expected_doc_scores, output.doc_scores, atol=TOLERANCE) - - # expected_loss = mindspore.tensor([36.3557]) - # _assert_tensors_equal(expected_loss, output.loss, atol=TOLERANCE) - - @slow - def test_rag_token_generate_beam(self): - - rag_config = self.get_rag_config() - rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( - "facebook/dpr-question_encoder-single-nq-base" - ) - rag_retriever = RagRetriever( - rag_config, - question_encoder_tokenizer=rag_question_encoder_tokenizer, - generator_tokenizer=rag_decoder_tokenizer, - ) - - rag_token = self.token_model - rag_token.set_retriever(rag_retriever) - - input_ids = rag_question_encoder_tokenizer( - "who sings does he love me with reba", return_tensors="ms" - ).input_ids - - input_ids = input_ids - - output_ids = rag_token.generate( - input_ids, - decoder_start_token_id=rag_token.generator.config.decoder_start_token_id, - num_beams=2, - num_return_sequences=2, - ) - # sequence generate test - output_text_1 = 
rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True) - output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True) - - # Expected outputs as given by model at integration time. - EXPECTED_OUTPUT_TEXT_1 = "\"She's My Kind of Girl" - EXPECTED_OUTPUT_TEXT_2 = "\"She's My Kind of Love" - - self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1) - self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2) - - @slow - def test_rag_sequence_generate_beam(self): - - rag_config = self.get_rag_config() - rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( - "facebook/dpr-question_encoder-single-nq-base" - ) - rag_retriever = RagRetriever( - rag_config, - question_encoder_tokenizer=rag_question_encoder_tokenizer, - generator_tokenizer=rag_decoder_tokenizer - ) - - rag_sequence = self.sequence_model - rag_sequence.set_retriever(rag_retriever) - - input_ids = rag_question_encoder_tokenizer( - "who sings does he love me with reba", return_tensors="ms" - ).input_ids - - input_ids = input_ids - - output_ids = rag_sequence.generate( - input_ids, - decoder_start_token_id=rag_sequence.generator.config.decoder_start_token_id, - num_beams=2, - num_return_sequences=2, - ) - # sequence generate test - output_text_1 = rag_decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True) - output_text_2 = rag_decoder_tokenizer.decode(output_ids[1], skip_special_tokens=True) - - # Expected outputs as given by model at integration time. - EXPECTED_OUTPUT_TEXT_1 = """\"She's My Kind of Girl\" was released through Epic Records in Japan in March 1972, giving the duo a Top 10 hit. Two more singles were released in Japan, \"En Carousel\" and \"Love Has Its Ways\" Ulvaeus and Andersson persevered with their songwriting and experimented with new sounds and vocal arrangements.""" - EXPECTED_OUTPUT_TEXT_2 = """In September 2018, Björn Ulvaeus revealed that the two new songs, \"I Still Have Faith In You\" and \"Don't Shut Me Down\", would be released no earlier than March 2019. 
The two new tracks will feature in a TV special set to air later in the year.""" - - self.assertEqual(output_text_1, EXPECTED_OUTPUT_TEXT_1) - self.assertEqual(output_text_2, EXPECTED_OUTPUT_TEXT_2) - - @property - def test_data_questions(self): - return [ - "who got the first nobel prize in physics", - "when is the next deadpool movie being released", - "which mode is used for short wave broadcast service", - "who is the owner of reading football club", - "when is the next scandal episode coming out", - "when is the last time the philadelphia won the superbowl", - "what is the most current adobe flash player version", - "how many episodes are there in dragon ball z", - ] - - @slow - def test_rag_sequence_generate_batch(self): # success - - tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq", timeout=100) - retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417", - trust_remote_code=True, - timeout=100 - ) - rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever, - timeout=100) - - input_dict = tokenizer( - self.test_data_questions, - return_tensors="ms", - padding=True, - truncation=True, - ) - - input_ids = input_dict.input_ids - attention_mask = input_dict.attention_mask - - output_ids = rag_sequence.generate( - input_ids, - attention_mask=attention_mask, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - EXPECTED_OUTPUTS = [ - " albert einstein", - " june 22, 2018", - " amplitude modulation", - " tim besley ( chairman )", - " june 20, 2018", - " 1980", - " 7.0", - " 8", - ] - self.assertListEqual(outputs, EXPECTED_OUTPUTS) - - @slow - def test_rag_sequence_generate_batch_from_context_input_ids(self): - - tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") - retriever = RagRetriever.from_pretrained( - "facebook/rag-sequence-nq", - index_name="exact", - use_dummy_dataset=True, - dataset_revision="b24a417", - trust_remote_code=True - ) - rag_sequence = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever) - - input_dict = tokenizer( - self.test_data_questions, - return_tensors="ms", - padding=True, - truncation=True, - ) - - input_ids = input_dict.input_ids - attention_mask = input_dict.attention_mask - - question_hidden_states = rag_sequence.question_encoder(input_ids, attention_mask=attention_mask)[0] - docs_dict = retriever( - input_ids.numpy(), question_hidden_states.numpy(), return_tensors="ms" - ) - doc_scores = ops.bmm( - question_hidden_states.unsqueeze(1), - ops.transpose(docs_dict["retrieved_doc_embeds"],1, 2), - ).squeeze(1) - - - output_ids = rag_sequence.generate( - context_input_ids=docs_dict["context_input_ids"], - context_attention_mask=docs_dict["context_attention_mask"], - doc_scores=doc_scores, - do_deduplication=True, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - EXPECTED_OUTPUTS = [ - " albert einstein", - " june 22, 2018", - " amplitude modulation", - " tim besley ( chairman )", - " june 20, 2018", - " 1980", - " 7.0", - " 8", - ] - self.assertListEqual(outputs, EXPECTED_OUTPUTS) - - @slow - def test_rag_token_generate_batch(self): - - tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") - retriever = RagRetriever.from_pretrained( - "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True, dataset_revision="b24a417", - trust_remote_code=True - ) - rag_token = 
RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever) - - input_dict = tokenizer( - self.test_data_questions, - return_tensors="ms", - padding=True, - truncation=True, - ) - - input_ids = input_dict.input_ids - attention_mask = input_dict.attention_mask - - output_ids = rag_token.generate( - input_ids, - attention_mask=attention_mask, - ) - - outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - EXPECTED_OUTPUTS = [ - " albert einstein", - " september 22, 2017", - " amplitude modulation", - " stefan persson", - " april 20, 2018", - " the 1970s", - " 7.1. 2", - " 13", - ] - self.assertListEqual(outputs, EXPECTED_OUTPUTS) - - -@require_mindspore -@require_retrieval -class RagModelSaveLoadTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - def get_rag_config(self): - question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base") - generator_config = AutoConfig.from_pretrained("facebook/bart-large-cnn") - return RagConfig.from_question_encoder_generator_configs( - question_encoder_config, - generator_config, - bos_token_id=0, - decoder_start_token_id=2, - eos_token_id=2, - is_encoder_decoder=True, - pad_token_id=1, - vocab_size=50264, - title_sep=" / ", - doc_sep=" // ", - n_docs=5, - max_combined_length=300, - dataset="wiki_dpr", - dataset_split="train", - index_name="exact", - index_path=None, - use_dummy_dataset=True, - retrieval_vector_size=768, - retrieval_batch_size=8, - dataset_revision="b24a417", - ) - - @slow - def test_rag_sequence_from_pretrained(self): - - rag_config = self.get_rag_config() - rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( - "facebook/dpr-question_encoder-single-nq-base" - ) - rag_retriever = RagRetriever( - rag_config, - question_encoder_tokenizer=rag_question_encoder_tokenizer, - generator_tokenizer=rag_decoder_tokenizer, - ) - - input_ids = rag_question_encoder_tokenizer( - "who sings does he love me with reba", return_tensors="ms" - ).input_ids - decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="ms").input_ids - - input_ids = input_ids - decoder_input_ids = decoder_input_ids - - with tempfile.TemporaryDirectory() as tmp_dirname: - rag_sequence = RagSequenceForGeneration.from_pretrained_question_encoder_generator( - "facebook/dpr-question_encoder-single-nq-base", - "facebook/bart-large-cnn", - retriever=rag_retriever, - config=rag_config, - ) - # check that the from pretrained methods work - rag_sequence.save_pretrained(tmp_dirname) - rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever) - - - with mindspore._no_grad(): - output = rag_sequence( - input_ids, - labels=decoder_input_ids, - ) - - loss_pretrained = output.loss - del rag_sequence - - question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base") - generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn") - rag_sequence = RagSequenceForGeneration( - config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever - ) - - - with mindspore._no_grad(): - output = rag_sequence( - input_ids, - labels=decoder_input_ids, - ) - - loss_init = output.loss - - self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4) - - @slow - def test_rag_token_from_pretrained(self): - - 
rag_config = self.get_rag_config() - rag_decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn") - rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( - "facebook/dpr-question_encoder-single-nq-base", timeout = 100 - ) - rag_retriever = RagRetriever( - rag_config, - question_encoder_tokenizer=rag_question_encoder_tokenizer, - generator_tokenizer=rag_decoder_tokenizer, - ) - - input_ids = rag_question_encoder_tokenizer( - "who sings does he love me with reba", return_tensors="ms" - ).input_ids - decoder_input_ids = rag_decoder_tokenizer("Linda Davis", return_tensors="ms").input_ids - - with tempfile.TemporaryDirectory() as tmp_dirname: - rag_token = RagTokenForGeneration.from_pretrained_question_encoder_generator( - "facebook/dpr-question_encoder-single-nq-base", - "facebook/bart-large-cnn", - retriever=rag_retriever, - config=rag_config, - question_encoder_max_length=200, - generator_max_length=200, - timeout = 100, - ) - # check that the from pretrained methods work - rag_token.save_pretrained(tmp_dirname) - rag_token.from_pretrained(tmp_dirname, retriever=rag_retriever) - - - self.assertTrue(rag_token.question_encoder.config.max_length == 200) - self.assertTrue(rag_token.generator.config.max_length == 200) - - with mindspore._no_grad(): - output = rag_token( - input_ids, - labels=decoder_input_ids, - ) - - loss_pretrained = output.loss - del rag_token - - question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base") - generator = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn") - rag_token = RagTokenForGeneration( - config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever - ) - - - with mindspore._no_grad(): - output = rag_token( - input_ids, - labels=decoder_input_ids, - ) - - loss_init = output.loss - - self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4) diff --git a/tests/transformers/models/rag/test_retrieval_rag.py b/tests/transformers/models/rag/test_retrieval_rag.py deleted file mode 100644 index cb437dc1b..000000000 --- a/tests/transformers/models/rag/test_retrieval_rag.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
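The generation tests above all follow the same pattern: encode the question, let the retriever assemble the concatenated (question + passage) contexts, score each passage with a batched inner product, and hand everything to generate(). A condensed sketch of that flow, reusing the same checkpoints and dummy wiki_dpr index the tests rely on (network access and the pinned dataset revision are assumed, exactly as in the tests):

from mindnlp.core import ops
from mindnlp.transformers import RagRetriever, RagSequenceForGeneration, RagTokenizer

tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True,
    dataset_revision="b24a417", trust_remote_code=True,
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

inputs = tokenizer(["who got the first nobel prize in physics"],
                   return_tensors="ms", padding=True, truncation=True)

# Encode the question, then let the retriever fetch passages and build the
# concatenated (question + passage) contexts.
question_hidden_states = model.question_encoder(
    inputs.input_ids, attention_mask=inputs.attention_mask)[0]
docs = retriever(inputs.input_ids.numpy(), question_hidden_states.numpy(), return_tensors="ms")

# Passage relevance: batched inner product between the question embedding and
# the retrieved passage embeddings.
doc_scores = ops.bmm(
    question_hidden_states.unsqueeze(1),
    ops.transpose(docs["retrieved_doc_embeds"], 1, 2),
).squeeze(1)

answer_ids = model.generate(
    context_input_ids=docs["context_input_ids"],
    context_attention_mask=docs["context_attention_mask"],
    doc_scores=doc_scores,
)
print(tokenizer.batch_decode(answer_ids, skip_special_tokens=True))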
- -import json -import os -import shutil -import tempfile -from unittest import TestCase -from unittest.mock import patch - -import numpy as np -from datasets import Dataset - -from mindnlp.transformers.models.bart.configuration_bart import BartConfig -from mindnlp.transformers.models.bart.tokenization_bart import BartTokenizer -from mindnlp.transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES -from mindnlp.transformers.models.dpr.configuration_dpr import DPRConfig -from mindnlp.transformers.models.dpr.tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer -from mindnlp.transformers.models.rag.configuration_rag import RagConfig -from mindnlp.transformers.models.rag.retrieval_rag import CustomHFIndex, RagRetriever -from mindnlp.utils import is_faiss_available -from mindnlp.transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore - -if is_faiss_available(): - import faiss -else: - raise ImportError("please install faiss") - - -class RagRetrieverTest(TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - self.retrieval_vector_size = 8 - - # DPR tok - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") - os.makedirs(dpr_tokenizer_path, exist_ok=True) - self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - # BART tok - vocab = [ - "l", - "o", - "w", - "e", - "r", - "s", - "t", - "i", - "d", - "n", - "\u0120", - "\u0120l", - "\u0120n", - "\u0120lo", - "\u0120low", - "er", - "\u0120lowest", - "\u0120newer", - "\u0120wider", - "", - ] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") - os.makedirs(bart_tokenizer_path, exist_ok=True) - self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - def get_dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: - return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) - - def get_dpr_ctx_encoder_tokenizer(self) -> DPRContextEncoderTokenizer: - return DPRContextEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) - - def get_bart_tokenizer(self) -> BartTokenizer: - return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer")) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def get_dummy_dataset(self): - dataset = Dataset.from_dict( - { - "id": ["0", "1"], - "text": ["foo", "bar"], - "title": ["Foo", "Bar"], - "embeddings": [np.ones(self.retrieval_vector_size), 2 * np.ones(self.retrieval_vector_size)], - } - ) - dataset.add_faiss_index("embeddings", 
string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT) - return dataset - - def get_dummy_canonical_hf_index_retriever(self): - dataset = self.get_dummy_dataset() - config = RagConfig( - retrieval_vector_size=self.retrieval_vector_size, - question_encoder=DPRConfig().to_dict(), - generator=BartConfig().to_dict(), - ) - with patch("mindnlp.transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: - mock_load_dataset.return_value = dataset - retriever = RagRetriever( - config, - question_encoder_tokenizer=self.get_dpr_tokenizer(), - generator_tokenizer=self.get_bart_tokenizer(), - ) - return retriever - - def get_dummy_custom_hf_index_retriever(self, from_disk: bool): - dataset = self.get_dummy_dataset() - config = RagConfig( - retrieval_vector_size=self.retrieval_vector_size, - question_encoder=DPRConfig().to_dict(), - generator=BartConfig().to_dict(), - index_name="custom", - ) - if from_disk: - config.passages_path = os.path.join(self.tmpdirname, "dataset") - config.index_path = os.path.join(self.tmpdirname, "index.faiss") - dataset.get_index("embeddings").save(os.path.join(self.tmpdirname, "index.faiss")) - dataset.drop_index("embeddings") - dataset.save_to_disk(os.path.join(self.tmpdirname, "dataset")) - del dataset - retriever = RagRetriever( - config, - question_encoder_tokenizer=self.get_dpr_tokenizer(), - generator_tokenizer=self.get_bart_tokenizer(), - ) - else: - retriever = RagRetriever( - config, - question_encoder_tokenizer=self.get_dpr_tokenizer(), - generator_tokenizer=self.get_bart_tokenizer(), - index=CustomHFIndex(config.retrieval_vector_size, dataset), - ) - return retriever - - def test_canonical_hf_index_retriever_retrieve(self): - n_docs = 1 - retriever = self.get_dummy_canonical_hf_index_retriever() - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs) - self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) - self.assertEqual(len(doc_dicts), 2) - self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"]) - self.assertEqual(len(doc_dicts[0]["id"]), n_docs) - self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc - self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc - self.assertListEqual(doc_ids.tolist(), [[1], [0]]) - - def test_canonical_hf_index_retriever_save_and_from_pretrained(self): - retriever = self.get_dummy_canonical_hf_index_retriever() - with tempfile.TemporaryDirectory() as tmp_dirname: - with patch("mindnlp.transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset: - mock_load_dataset.return_value = self.get_dummy_dataset() - retriever.save_pretrained(tmp_dirname) - retriever = RagRetriever.from_pretrained(tmp_dirname) - self.assertIsInstance(retriever, RagRetriever) - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - out = retriever.retrieve(hidden_states, n_docs=1) - self.assertTrue(out is not None) - - def test_custom_hf_index_retriever_retrieve(self): - n_docs = 1 - retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, 
n_docs=n_docs) - self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) - self.assertEqual(len(doc_dicts), 2) - self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"]) - self.assertEqual(len(doc_dicts[0]["id"]), n_docs) - self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc - self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc - self.assertListEqual(doc_ids.tolist(), [[1], [0]]) - - def test_custom_hf_index_retriever_save_and_from_pretrained(self): - retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) - with tempfile.TemporaryDirectory() as tmp_dirname: - retriever.save_pretrained(tmp_dirname) - retriever = RagRetriever.from_pretrained(tmp_dirname) - self.assertIsInstance(retriever, RagRetriever) - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - out = retriever.retrieve(hidden_states, n_docs=1) - self.assertTrue(out is not None) - - def test_custom_hf_index_retriever_retrieve_from_disk(self): - n_docs = 1 - retriever = self.get_dummy_custom_hf_index_retriever(from_disk=True) - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs) - self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) - self.assertEqual(len(doc_dicts), 2) - self.assertEqual(sorted(doc_dicts[0]), ["embeddings", "id", "text", "title"]) - self.assertEqual(len(doc_dicts[0]["id"]), n_docs) - self.assertEqual(doc_dicts[0]["id"][0], "1") # max inner product is reached with second doc - self.assertEqual(doc_dicts[1]["id"][0], "0") # max inner product is reached with first doc - self.assertListEqual(doc_ids.tolist(), [[1], [0]]) - - def test_custom_hf_index_retriever_save_and_from_pretrained_from_disk(self): - retriever = self.get_dummy_custom_hf_index_retriever(from_disk=True) - with tempfile.TemporaryDirectory() as tmp_dirname: - retriever.save_pretrained(tmp_dirname) - retriever = RagRetriever.from_pretrained(tmp_dirname) - self.assertIsInstance(retriever, RagRetriever) - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - out = retriever.retrieve(hidden_states, n_docs=1) - self.assertTrue(out is not None) - - @require_mindspore - @require_tokenizers - @require_sentencepiece - def test_hf_index_retriever_call(self): - import mindspore - - n_docs = 1 - retriever = self.get_dummy_canonical_hf_index_retriever() - question_input_ids = [[5, 7], [10, 11]] - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - out = retriever(question_input_ids, hidden_states, prefix=retriever.config.generator.prefix, n_docs=n_docs) - context_input_ids, context_attention_mask, retrieved_doc_embeds = ( - out["context_input_ids"], - out["context_attention_mask"], - out["retrieved_doc_embeds"], - ) - self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) - self.assertIsInstance(context_input_ids, list) - self.assertIsInstance(context_attention_mask, list) - self.assertIsInstance(retrieved_doc_embeds, np.ndarray) - - out = retriever( - question_input_ids, - hidden_states, - prefix=retriever.config.generator.prefix, - n_docs=n_docs, - return_tensors="ms", - ) 
- context_input_ids, context_attention_mask, retrieved_doc_embeds, doc_ids = ( # noqa: F841 - out["context_input_ids"], - out["context_attention_mask"], - out["retrieved_doc_embeds"], - out["doc_ids"], - ) - self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size)) - self.assertIsInstance(context_input_ids, mindspore.Tensor) - self.assertIsInstance(context_attention_mask, mindspore.Tensor) - self.assertIsInstance(retrieved_doc_embeds, mindspore.Tensor) - - @require_mindspore - @require_tokenizers - @require_sentencepiece - def test_custom_hf_index_end2end_retriever_call(self): - context_encoder_tokenizer = self.get_dpr_ctx_encoder_tokenizer() - n_docs = 1 - retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) - retriever.set_ctx_encoder_tokenizer(context_encoder_tokenizer) - - question_input_ids = [[5, 7], [10, 11]] - hidden_states = np.array( - [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32 - ) - out = retriever(question_input_ids, hidden_states, prefix=retriever.config.generator.prefix, n_docs=n_docs) - - self.assertEqual( - len(out), 6 - ) # check whether the retriever output consist of 6 attributes including tokenized docs - self.assertEqual( - all(k in out for k in ("tokenized_doc_ids", "tokenized_doc_attention_mask")), True - ) # check for doc token related keys in dictionary. diff --git a/tests/transformers/models/rag/test_tokenization_rag.py b/tests/transformers/models/rag/test_tokenization_rag.py deleted file mode 100644 index 4aeea66cc..000000000 --- a/tests/transformers/models/rag/test_tokenization_rag.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
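The retriever tests above reduce to a small recipe: build a tiny datasets.Dataset with an inner-product FAISS index over its "embeddings" column, wrap it in a CustomHFIndex, and query it with float32 question embeddings. A minimal sketch of that recipe follows; the two from_pretrained tokenizer checkpoints are stand-ins for the tiny local tokenizers the tests write to a temp directory:

import faiss
import numpy as np
from datasets import Dataset

from mindnlp.transformers import BartTokenizer, DPRQuestionEncoderTokenizer
from mindnlp.transformers.models.bart.configuration_bart import BartConfig
from mindnlp.transformers.models.dpr.configuration_dpr import DPRConfig
from mindnlp.transformers.models.rag.configuration_rag import RagConfig
from mindnlp.transformers.models.rag.retrieval_rag import CustomHFIndex, RagRetriever

vector_size = 8

# Two toy passages with hand-made embeddings; add_faiss_index builds an
# inner-product index over the "embeddings" column.
dataset = Dataset.from_dict({
    "id": ["0", "1"],
    "text": ["foo", "bar"],
    "title": ["Foo", "Bar"],
    "embeddings": [np.ones(vector_size), 2 * np.ones(vector_size)],
})
dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)

config = RagConfig(
    retrieval_vector_size=vector_size,
    question_encoder=DPRConfig().to_dict(),
    generator=BartConfig().to_dict(),
    index_name="custom",
)
retriever = RagRetriever(
    config,
    question_encoder_tokenizer=DPRQuestionEncoderTokenizer.from_pretrained(
        "facebook/dpr-question_encoder-single-nq-base"),
    generator_tokenizer=BartTokenizer.from_pretrained("facebook/bart-large-cnn"),
    index=CustomHFIndex(config.retrieval_vector_size, dataset),
)

# retrieve() takes a (batch, vector_size) float32 array of question embeddings and
# returns the top-n passage embeddings, their ids, and the passage dicts.
question_embeddings = np.array(
    [np.ones(vector_size), -np.ones(vector_size)], dtype=np.float32)
doc_embeds, doc_ids, doc_dicts = retriever.retrieve(question_embeddings, n_docs=1)
print(doc_ids.tolist())  # [[1], [0]]: the passage with the highest inner product wins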
- -import json -import os -import shutil -import tempfile -from unittest import TestCase - -from mindnlp.transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, \ - DPRQuestionEncoderTokenizerFast -from mindnlp.transformers.models.bart.configuration_bart import BartConfig -from mindnlp.transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES -from mindnlp.transformers.models.dpr.configuration_dpr import DPRConfig -from mindnlp.utils import is_datasets_available, is_faiss_available -from mindnlp.transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_tokenizers, require_mindspore, slow - -if is_mindspore_available() and is_datasets_available() and is_faiss_available(): - from mindnlp.transformers.models.rag.configuration_rag import RagConfig - from mindnlp.transformers.models.rag.tokenization_rag import RagTokenizer - - -@require_mindspore -class RagTokenizerTest(TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - self.retrieval_vector_size = 8 - - # DPR tok - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer") - os.makedirs(dpr_tokenizer_path, exist_ok=True) - self.vocab_file = os.path.join(dpr_tokenizer_path, DPR_VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - # BART tok - vocab = [ - "l", - "o", - "w", - "e", - "r", - "s", - "t", - "i", - "d", - "n", - "\u0120", - "\u0120l", - "\u0120n", - "\u0120lo", - "\u0120low", - "er", - "\u0120lowest", - "\u0120newer", - "\u0120wider", - "", - ] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - - bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer") - os.makedirs(bart_tokenizer_path, exist_ok=True) - self.vocab_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(bart_tokenizer_path, BART_VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - def get_dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer: - return DPRQuestionEncoderTokenizer.from_pretrained(os.path.join(self.tmpdirname, "dpr_tokenizer")) - - def get_bart_tokenizer(self) -> BartTokenizer: - return BartTokenizer.from_pretrained(os.path.join(self.tmpdirname, "bart_tokenizer")) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - @require_tokenizers - def test_save_load_pretrained_with_saved_config(self): - save_dir = os.path.join(self.tmpdirname, "rag_tokenizer") - rag_config = RagConfig(question_encoder=DPRConfig().to_dict(), generator=BartConfig().to_dict()) - rag_tokenizer = RagTokenizer(question_encoder=self.get_dpr_tokenizer(), generator=self.get_bart_tokenizer()) - rag_config.save_pretrained(save_dir) - rag_tokenizer.save_pretrained(save_dir) - new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config) - 
self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast) - self.assertEqual(new_rag_tokenizer.question_encoder.get_vocab(), rag_tokenizer.question_encoder.get_vocab()) - self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizerFast) - self.assertEqual(new_rag_tokenizer.generator.get_vocab(), rag_tokenizer.generator.get_vocab()) - - @slow - def test_pretrained_token_nq_tokenizer(self): - tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq") - input_strings = [ - "who got the first nobel prize in physics", - "when is the next deadpool movie being released", - "which mode is used for short wave broadcast service", - "who is the owner of reading football club", - "when is the next scandal episode coming out", - "when is the last time the philadelphia won the superbowl", - "what is the most current adobe flash player version", - "how many episodes are there in dragon ball z", - "what is the first step in the evolution of the eye", - "where is gall bladder situated in human body", - "what is the main mineral in lithium batteries", - "who is the president of usa right now", - "where do the greasers live in the outsiders", - "panda is a national animal of which country", - "what is the name of manchester united stadium", - ] - input_dict = tokenizer(input_strings) - self.assertIsNotNone(input_dict) - - @slow - def test_pretrained_sequence_nq_tokenizer(self): - tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq") - input_strings = [ - "who got the first nobel prize in physics", - "when is the next deadpool movie being released", - "which mode is used for short wave broadcast service", - "who is the owner of reading football club", - "when is the next scandal episode coming out", - "when is the last time the philadelphia won the superbowl", - "what is the most current adobe flash player version", - "how many episodes are there in dragon ball z", - "what is the first step in the evolution of the eye", - "where is gall bladder situated in human body", - "what is the main mineral in lithium batteries", - "who is the president of usa right now", - "where do the greasers live in the outsiders", - "panda is a national animal of which country", - "what is the name of manchester united stadium", - ] - input_dict = tokenizer(input_strings) - self.assertIsNotNone(input_dict) diff --git a/tests/transformers/models/realm/__init__.py b/tests/transformers/models/realm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/realm/test_modeling_realm.py b/tests/transformers/models/realm/test_modeling_realm.py deleted file mode 100644 index 41e264f55..000000000 --- a/tests/transformers/models/realm/test_modeling_realm.py +++ /dev/null @@ -1,547 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the PyTorch REALM model. 
""" - -import copy -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import slow, require_mindspore, is_mindspore_available -from mindnlp.transformers import RealmConfig - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - RealmEmbedder, - RealmForOpenQA, - RealmKnowledgeAugEncoder, - RealmReader, - RealmRetriever, - RealmScorer, - RealmTokenizer, - ) - - -class RealmModelTester: - def __init__( - self, - parent, - batch_size=13, - retriever_proj_size=128, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - span_hidden_size=50, - max_span_width=10, - reader_layer_norm_eps=1e-3, - reader_beam_size=4, - reader_seq_len=288 + 32, - num_block_records=13353718, - searcher_beam_size=8, - searcher_seq_len=64, - num_labels=3, - num_choices=4, - num_candidates=10, - scope=None, - ): - # General config - self.parent = parent - self.batch_size = batch_size - self.retriever_proj_size = retriever_proj_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - # Reader config - self.span_hidden_size = span_hidden_size - self.max_span_width = max_span_width - self.reader_layer_norm_eps = reader_layer_norm_eps - self.reader_beam_size = reader_beam_size - self.reader_seq_len = reader_seq_len - - # Searcher config - self.num_block_records = num_block_records - self.searcher_beam_size = searcher_beam_size - self.searcher_seq_len = searcher_seq_len - - self.num_labels = num_labels - self.num_choices = num_choices - self.num_candidates = num_candidates - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - candiate_input_ids = ids_tensor([self.batch_size, self.num_candidates, self.seq_length], self.vocab_size) - reader_input_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.vocab_size) - - input_mask = None - candiate_input_mask = None - reader_input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - candiate_input_mask = random_attention_mask([self.batch_size, self.num_candidates, self.seq_length]) - reader_input_mask = 
random_attention_mask([self.reader_beam_size, self.reader_seq_len]) - - token_type_ids = None - candidate_token_type_ids = None - reader_token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - candidate_token_type_ids = ids_tensor( - [self.batch_size, self.num_candidates, self.seq_length], self.type_vocab_size - ) - reader_token_type_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - # inputs with additional num_candidates axis. - scorer_encoder_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids) - # reader inputs - reader_inputs = (reader_input_ids, reader_input_mask, reader_token_type_ids) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - scorer_encoder_inputs, - reader_inputs, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return RealmConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - retriever_proj_size=self.retriever_proj_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_candidates=self.num_candidates, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def create_and_check_embedder( - self, - config, - input_ids, - token_type_ids, - input_mask, - scorer_encoder_inputs, - reader_inputs, - sequence_labels, - token_labels, - choice_labels, - ): - model = RealmEmbedder(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.projected_score.shape, (self.batch_size, self.retriever_proj_size)) - - def create_and_check_encoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - scorer_encoder_inputs, - reader_inputs, - sequence_labels, - token_labels, - choice_labels, - ): - model = RealmKnowledgeAugEncoder(config=config) - model.eval() - relevance_score = floats_tensor([self.batch_size, self.num_candidates]) - result = model( - scorer_encoder_inputs[0], - attention_mask=scorer_encoder_inputs[1], - token_type_ids=scorer_encoder_inputs[2], - relevance_score=relevance_score, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size * self.num_candidates, self.seq_length, self.vocab_size) - ) - - def create_and_check_reader( - self, - config, - input_ids, - token_type_ids, - input_mask, - scorer_encoder_inputs, - reader_inputs, - sequence_labels, - token_labels, - choice_labels, - ): - model = RealmReader(config=config) - model.eval() - relevance_score = floats_tensor([self.reader_beam_size]) - result = model( - reader_inputs[0], - attention_mask=reader_inputs[1], - token_type_ids=reader_inputs[2], - relevance_score=relevance_score, - ) - self.parent.assertEqual(result.block_idx.shape, ()) - self.parent.assertEqual(result.candidate.shape, ()) - 
self.parent.assertEqual(result.start_pos.shape, ()) - self.parent.assertEqual(result.end_pos.shape, ()) - - def create_and_check_scorer( - self, - config, - input_ids, - token_type_ids, - input_mask, - scorer_encoder_inputs, - reader_inputs, - sequence_labels, - token_labels, - choice_labels, - ): - model = RealmScorer(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - candidate_input_ids=scorer_encoder_inputs[0], - candidate_attention_mask=scorer_encoder_inputs[1], - candidate_token_type_ids=scorer_encoder_inputs[2], - ) - self.parent.assertEqual(result.relevance_score.shape, (self.batch_size, self.num_candidates)) - self.parent.assertEqual(result.query_score.shape, (self.batch_size, self.retriever_proj_size)) - self.parent.assertEqual( - result.candidate_score.shape, (self.batch_size, self.num_candidates, self.retriever_proj_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - scorer_encoder_inputs, - reader_inputs, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class RealmModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - RealmEmbedder, - RealmKnowledgeAugEncoder, - # RealmScorer is excluded from common tests as it is a container model - # consisting of two RealmEmbedders & a simple inner product calculation. - # RealmScorer - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = () - pipeline_model_mapping = {} if is_mindspore_available() else {} - - # disable these tests because there is no base_model in Realm - test_save_load_fast_init_from_base = False - test_save_load_fast_init_to_base = False - - def setUp(self): - self.test_pruning = False - self.model_tester = RealmModelTester(self) - self.config_tester = ConfigTester(self, config_class=RealmConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_embedder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_embedder(*config_and_inputs) - - def test_encoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_encoder(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_embedder(*config_and_inputs) - self.model_tester.create_and_check_encoder(*config_and_inputs) - - def test_scorer(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_scorer(*config_and_inputs) - - def test_training(self): - if not self.model_tester.is_training: - return - - config, *inputs = self.model_tester.prepare_config_and_inputs() - input_ids, token_type_ids, input_mask, scorer_encoder_inputs = inputs[0:4] - config.return_dict = True - - tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa") - - # RealmKnowledgeAugEncoder training - model = RealmKnowledgeAugEncoder(config) - model.train() - - inputs_dict = { - "input_ids": scorer_encoder_inputs[0], - "attention_mask": 
scorer_encoder_inputs[1], - "token_type_ids": scorer_encoder_inputs[2], - "relevance_score": floats_tensor([self.model_tester.batch_size, self.model_tester.num_candidates]), - } - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - inputs = inputs_dict - loss = model(**inputs).loss - # loss.backward() - - # RealmForOpenQA training - openqa_config = copy.deepcopy(config) - openqa_config.vocab_size = 30522 # the retrieved texts will inevitably have more than 99 vocabs. - openqa_config.num_block_records = 5 - openqa_config.searcher_beam_size = 2 - - block_records = np.array( - [ - b"This is the first record.", - b"This is the second record.", - b"This is the third record.", - b"This is the fourth record.", - b"This is the fifth record.", - ], - dtype=object, - ) - retriever = RealmRetriever(block_records, tokenizer) - model = RealmForOpenQA(openqa_config, retriever) - model.train() - - inputs_dict = { - "input_ids": input_ids[:1], - "attention_mask": input_mask[:1], - "token_type_ids": token_type_ids[:1], - "answer_ids": input_ids[:1].tolist(), - } - inputs = self._prepare_for_class(inputs_dict, RealmForOpenQA) - loss = model(**inputs).reader_output.loss - # loss.backward() - - # Test model.block_embedding_to - loss = model(**inputs).reader_output.loss - # loss.backward() - - @slow - def test_embedder_from_pretrained(self): - model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder") - self.assertIsNotNone(model) - - @slow - def test_encoder_from_pretrained(self): - model = RealmKnowledgeAugEncoder.from_pretrained("google/realm-cc-news-pretrained-encoder") - self.assertIsNotNone(model) - - @slow - def test_open_qa_from_pretrained(self): - model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa") - self.assertIsNotNone(model) - - @slow - def test_reader_from_pretrained(self): - model = RealmReader.from_pretrained("google/realm-orqa-nq-reader") - self.assertIsNotNone(model) - - @slow - def test_scorer_from_pretrained(self): - model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer") - self.assertIsNotNone(model) - - -@require_mindspore -class RealmModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_embedder(self): - retriever_projected_size = 128 - - model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder") - input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - expected_shape = (1, retriever_projected_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor([[-0.0714, -0.0837, -0.1314]]) - self.assertTrue(np.allclose(output[:, :3].numpy(), expected_slice.numpy(), atol=1e-4)) - - @slow - def test_inference_encoder(self): - num_candidates = 2 - vocab_size = 30522 - - model = RealmKnowledgeAugEncoder.from_pretrained( - "google/realm-cc-news-pretrained-encoder", num_candidates=num_candidates - ) - input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) - relevance_score = mindspore.tensor([[0.3, 0.7]], dtype=mindspore.float32) - output = model(input_ids, relevance_score=relevance_score)[0] - - expected_shape = (2, 6, vocab_size) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor([[[-11.0888, -11.2544], [-10.2170, -10.3874]]]) - - self.assertTrue(np.allclose(output[1, :2, :2].numpy(), expected_slice.numpy(), atol=1e-4)) - - @slow - def test_inference_open_qa(self): - from 
mindnlp.transformers.models.realm.retrieval_realm import RealmRetriever - - tokenizer = RealmTokenizer.from_pretrained("google/realm-orqa-nq-openqa") - retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa") - - model = RealmForOpenQA.from_pretrained( - "google/realm-orqa-nq-openqa", - retriever=retriever, - ) - - question = "Who is the pioneer in modern computer science?" - - question = tokenizer( - [question], - padding=True, - truncation=True, - max_length=model.config.searcher_seq_len, - return_tensors="ms", - ) - print(question) - predicted_answer_ids = model(**question).predicted_answer_ids - predicted_answer_ids = mindspore.tensor([ 5070, 8785, 10929, 28639]) - predicted_answer = tokenizer.decode(predicted_answer_ids) - self.assertEqual(predicted_answer, "alan mathison turing") - - @slow - def test_inference_reader(self): - config = RealmConfig(reader_beam_size=2, max_span_width=3) - model = RealmReader.from_pretrained("google/realm-orqa-nq-reader", config=config) - - concat_input_ids = ops.arange(10).view((2, 5)) - concat_token_type_ids = mindspore.tensor([[0, 0, 1, 1, 1], [0, 0, 1, 1, 1]], dtype=mindspore.int64) - concat_block_mask = mindspore.tensor([[0, 0, 1, 1, 0], [0, 0, 1, 1, 0]], dtype=mindspore.int64) - relevance_score = mindspore.tensor([0.3, 0.7], dtype=mindspore.float32) - - output = model( - concat_input_ids, - token_type_ids=concat_token_type_ids, - relevance_score=relevance_score, - block_mask=concat_block_mask, - return_dict=True, - ) - - block_idx_expected_shape = (1,) - start_pos_expected_shape = (1,) - end_pos_expected_shape = (1,) - - self.assertEqual(output.block_idx.shape, block_idx_expected_shape) - self.assertEqual(output.start_pos.shape, start_pos_expected_shape) - self.assertEqual(output.end_pos.shape, end_pos_expected_shape) - - expected_block_idx = mindspore.tensor(1) - expected_start_pos = mindspore.tensor(3) - expected_end_pos = mindspore.tensor(3) - - self.assertTrue(np.allclose(output.block_idx.numpy(), expected_block_idx.numpy(), atol=1e-4)) - self.assertTrue(np.allclose(output.start_pos.numpy(), expected_start_pos.numpy(), atol=1e-4)) - self.assertTrue(np.allclose(output.end_pos.numpy(), expected_end_pos.numpy(), atol=1e-4)) - - @slow - def test_inference_scorer(self): - num_candidates = 2 - - model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer", num_candidates=num_candidates) - - input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5]]) - candidate_input_ids = mindspore.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) - output = model(input_ids, candidate_input_ids=candidate_input_ids)[0] - - expected_shape = (1, 2) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor([[0.7410, 0.7170]]) - self.assertTrue(np.allclose(output.numpy(), expected_slice.numpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/reformer/__init__.py b/tests/transformers/models/reformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/reformer/test_modeling_reformer.py b/tests/transformers/models/reformer/test_modeling_reformer.py deleted file mode 100644 index 2109db101..000000000 --- a/tests/transformers/models/reformer/test_modeling_reformer.py +++ /dev/null @@ -1,1299 +0,0 @@ -# coding=utf-8 # Copyright 2020 Huggingface -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.transformers import ReformerConfig -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - is_mindspore_available, - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - ReformerForMaskedLM, - ReformerForQuestionAnswering, - ReformerForSequenceClassification, - ReformerLayer, - ReformerModel, - ReformerModelWithLMHead, - ReformerTokenizer, - ) - - -class ReformerModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=32, - is_training=True, - is_decoder=True, - use_input_mask=True, - use_labels=True, - vocab_size=32, - attention_head_size=16, - hidden_size=32, - num_attention_heads=2, - local_attn_chunk_length=4, - local_num_chunks_before=1, - local_num_chunks_after=0, - num_buckets=None, - num_hashes=1, - lsh_attn_chunk_length=None, - lsh_num_chunks_before=None, - lsh_num_chunks_after=None, - chunk_size_lm_head=0, - chunk_size_feed_forward=0, - feed_forward_size=32, - hidden_act="gelu", - hidden_dropout_prob=0.1, - local_attention_probs_dropout_prob=0.1, - lsh_attention_probs_dropout_prob=None, - max_position_embeddings=512, - initializer_range=0.02, - axial_norm_std=1.0, - layer_norm_eps=1e-12, - axial_pos_embds=True, - axial_pos_shape=[4, 8], - axial_pos_embds_dim=[16, 16], - attn_layers=["local", "local", "local", "local"], - pad_token_id=0, - eos_token_id=2, - scope=None, - hash_seed=0, - num_labels=2, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.is_decoder = is_decoder - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.attention_head_size = attention_head_size - self.hidden_size = hidden_size - self.num_attention_heads = num_attention_heads - self.num_hidden_layers = len(attn_layers) if attn_layers is not None else 0 - self.local_attn_chunk_length = local_attn_chunk_length - self.local_num_chunks_after = local_num_chunks_after - self.local_num_chunks_before = local_num_chunks_before - self.num_hashes = num_hashes - self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets - self.lsh_attn_chunk_length = lsh_attn_chunk_length - self.lsh_num_chunks_after = lsh_num_chunks_after - self.lsh_num_chunks_before = lsh_num_chunks_before - self.hidden_act = hidden_act - self.feed_forward_size = feed_forward_size - self.hidden_dropout_prob = hidden_dropout_prob - self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob - self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.layer_norm_eps = 
layer_norm_eps - self.axial_pos_embds = axial_pos_embds - self.axial_pos_shape = tuple(axial_pos_shape) - self.axial_pos_embds_dim = tuple(axial_pos_embds_dim) - self.axial_norm_std = axial_norm_std - self.chunk_size_lm_head = chunk_size_lm_head - self.chunk_size_feed_forward = chunk_size_feed_forward - self.scope = scope - self.attn_layers = attn_layers - self.pad_token_id = pad_token_id - self.hash_seed = hash_seed - - attn_chunk_length = local_attn_chunk_length if local_attn_chunk_length is not None else lsh_attn_chunk_length - num_chunks_after = local_num_chunks_after if local_num_chunks_after is not None else lsh_num_chunks_after - num_chunks_before = local_num_chunks_before if local_num_chunks_before is not None else lsh_num_chunks_before - - self.encoder_seq_length = seq_length // attn_chunk_length + (self.seq_length % attn_chunk_length != 0) - self.key_length = (num_chunks_before + num_chunks_after + 1) * attn_chunk_length - self.chunk_length = attn_chunk_length - self.num_labels = num_labels - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - choice_labels = None - if self.use_labels: - choice_labels = ids_tensor([self.batch_size], 2) - - config = self.get_config() - - return ( - config, - input_ids, - input_mask, - choice_labels, - ) - - def get_config(self): - return ReformerConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - feed_forward_size=self.feed_forward_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - local_attention_probs_dropout_prob=self.local_attention_probs_dropout_prob, - lsh_attention_probs_dropout_prob=self.lsh_attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - is_decoder=self.is_decoder, - axial_pos_embds=self.axial_pos_embds, - axial_pos_shape=self.axial_pos_shape, - axial_pos_embds_dim=self.axial_pos_embds_dim, - local_attn_chunk_length=self.local_attn_chunk_length, - local_num_chunks_after=self.local_num_chunks_after, - local_num_chunks_before=self.local_num_chunks_before, - num_hashes=self.num_hashes, - num_buckets=self.num_buckets, - lsh_attn_chunk_length=self.lsh_attn_chunk_length, - lsh_num_chunks_after=self.lsh_num_chunks_after, - lsh_num_chunks_before=self.lsh_num_chunks_before, - attn_layers=self.attn_layers, - pad_token_id=self.pad_token_id, - hash_seed=self.hash_seed, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 100 - config.max_position_embeddings = 100 - config.axial_pos_shape = (4, 25) - config.is_decoder = False - return config - - def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels): - model = ReformerModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - - # 2 * hidden_size because we use reversible resnet layers - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.seq_length, 2 * self.hidden_size) - ) - - def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels): - config.is_decoder = False - config.lsh_num_chunks_after = 1 - model = ReformerForMaskedLM(config=config) - model.train() - loss = model(input_ids, attention_mask=input_mask, 
labels=input_ids)["loss"] - loss.backward() - - def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choice_labels): - config.lsh_num_chunks_after = 0 - config.is_decoder = True - model = ReformerModelWithLMHead(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_reformer_with_mlm(self, config, input_ids, input_mask, choice_labels): - config.is_decoder = False - model = ReformerForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_reformer_model_with_attn_mask( - self, config, input_ids, input_mask, choice_labels, is_decoder=False - ): - # no special position embeddings - config.axial_pos_embds = False - config.is_decoder = is_decoder - - if self.lsh_attn_chunk_length is not None: - # need to set chunk length equal sequence length to be certain that chunking works - config.lsh_attn_chunk_length = self.seq_length - - model = ReformerModel(config=config) - model.eval() - # set all position encodings to zero so that postions don't matter - with no_grad(): - embedding = model.embeddings.position_embeddings.embedding - embedding.weight = nn.Parameter(ops.zeros(embedding.weight.shape)) - embedding.weight.requires_grad = False - - half_seq_len = self.seq_length // 2 - roll = self.chunk_length - - half_input_ids = input_ids[:, :half_seq_len] - - # normal padded - attn_mask = ops.cat( - [ops.ones_like(half_input_ids), ops.zeros_like(half_input_ids)], - dim=-1, - ) - input_ids_padded = ops.cat( - [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], - dim=-1, - ) - - # shifted padded - input_ids_roll = ops.cat( - [half_input_ids, ids_tensor((self.batch_size, half_seq_len), self.vocab_size)], - dim=-1, - ) - input_ids_roll = ops.roll(input_ids_roll, roll, dims=-1) - attn_mask_roll = ops.roll(attn_mask, roll, dims=-1) - - output_padded = model(input_ids_padded, attention_mask=attn_mask)[0][:, :half_seq_len] - output_padded_rolled = model(input_ids_roll, attention_mask=attn_mask_roll)[0][:, roll : half_seq_len + roll] - - self.parent.assertTrue(ops.allclose(output_padded, output_padded_rolled, atol=1e-3)) - - def create_and_check_reformer_layer_dropout_seed( - self, config, input_ids, input_mask, choice_labels, is_decoder=False - ): - config.is_decoder = is_decoder - layer = ReformerLayer(config) - layer.train() - shape = ( - self.batch_size, - self.seq_length, - config.hidden_size, - ) # Batch x SeqLen x hiddenSize - - # get random tensors - hidden_states = floats_tensor(shape) - prev_attn_output = floats_tensor(shape) - - # now the random seeds for attention and feed forward is initialized - # forward tensors with dropout - layer_outputs = layer(prev_attn_output, hidden_states, attention_mask=input_mask) - - next_attn_output = layer_outputs.attn_output - next_hidden_states = layer_outputs.hidden_states - - mindspore.manual_seed(layer.attention_seed) - mindspore.set_seed(layer.attention_seed) - attn_outputs = layer.attention(hidden_states, attention_mask=input_mask) - self.parent.assertTrue( - ops.allclose( - prev_attn_output + attn_outputs.hidden_states, - next_attn_output, - atol=1e-3, - ) - ) - - mindspore.manual_seed(layer.feed_forward_seed) - mindspore.set_seed(layer.feed_forward_seed) - 
feed_forward_hidden_states = layer.feed_forward(next_attn_output) - self.parent.assertTrue( - ops.allclose( - next_hidden_states, - hidden_states + feed_forward_hidden_states, - atol=1e-3, - ) - ) - - def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels): - # disable dropout - config.hidden_dropout_prob = 0 - config.local_attention_probs_dropout_prob = 0 - config.lsh_attention_probs_dropout_prob = 0 - config.lsh_num_chunks_after = 1 - config.is_decoder = False - - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = ReformerForMaskedLM(config=config) - model.train() - model.zero_grad() - loss_no_chunk, output_no_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2] - loss_no_chunk.backward() - grad_slice_word_no_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] - grad_slice_position_factor_1_no_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] - grad_slice_position_factor_2_no_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] - - config.chunk_size_lm_head = 1 - config.chunk_size_feed_forward = 1 - - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = ReformerForMaskedLM(config=config) - model.train() - model.zero_grad() - loss_chunk, output_chunk = model(input_ids, labels=input_ids, attention_mask=input_mask)[:2] - loss_chunk.backward() - grad_slice_word_chunk = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] - grad_slice_position_factor_1_chunk = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] - grad_slice_position_factor_2_chunk = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] - self.parent.assertTrue(ops.allclose(loss_chunk, loss_no_chunk, atol=1e-3)) - self.parent.assertTrue(ops.allclose(grad_slice_word_no_chunk, grad_slice_word_chunk, atol=1e-3)) - self.parent.assertTrue( - ops.allclose(grad_slice_position_factor_1_chunk, grad_slice_position_factor_1_no_chunk, atol=1e-3) - ) - self.parent.assertTrue( - ops.allclose(grad_slice_position_factor_2_chunk, grad_slice_position_factor_2_no_chunk, atol=1e-3) - ) - - def create_and_check_reformer_random_seed(self, config, input_ids, input_mask, choice_labels): - layer = ReformerLayer(config) - layer.train() - - shape = ( - self.batch_size, - self.seq_length, - config.hidden_size, - ) # Batch x SeqLen x hiddenSize - - hidden_states = floats_tensor(shape) - attn_output = floats_tensor(shape) - - seeds = [] - for _ in range(100): - layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask) - attn_output = layer_outputs.attn_output - hidden_states = layer_outputs.hidden_states - mindspore.manual_seed(layer.attention_seed) - mindspore.set_seed(layer.attention_seed) - seeds.append(layer.attention_seed) - self.parent.assertGreater(len(set(seeds)), 70) - - seeds = [] - for _ in range(100): - layer_outputs = layer(attn_output, hidden_states, attention_mask=input_mask) - attn_output = layer_outputs.attn_output - hidden_states = layer_outputs.hidden_states - mindspore.manual_seed(layer.feed_forward_seed) - mindspore.set_seed(layer.feed_forward_seed) - seeds.append(layer.feed_forward_seed) - self.parent.assertGreater(len(set(seeds)), 70) - - def create_and_check_reformer_model_fp16_forward(self, config, input_ids, input_mask, choice_labels): - model = ReformerModel(config=config) - model.half() - model.eval() - output = model(input_ids, attention_mask=input_mask)["last_hidden_state"] - 
self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_reformer_model_generate(self, config, input_ids, input_mask, choice_labels): - config.is_decoder = True - config.lsh_num_chunks_after = 0 - config.bos_token_id = 0 - config.eos_token_id = None - config.max_length = 20 - - model = ReformerModelWithLMHead(config=config) - model.eval() - output = model.generate() - self.parent.assertIsNotNone(output) - - def create_and_check_reformer_model_fp16_generate(self, config, input_ids, input_mask, choice_labels): - config.is_decoder = True - config.lsh_num_chunks_after = 0 - model = ReformerModelWithLMHead(config=config) - model.half() - model.eval() - # only use last 10 inputs for generation - output = model.generate(input_ids[:, -10:], attention_mask=input_mask, do_sample=False) - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_reformer_no_chunking(self, config, input_ids, input_mask, choice_labels): - # force chunk length to be bigger than input_ids - config.lsh_attn_chunk_length = 2 * input_ids.shape[-1] - config.local_attn_chunk_length = 2 * input_ids.shape[-1] - config.lsh_num_chunks_after = 1 - config.is_decoder = False - model = ReformerForMaskedLM(config=config) - model.eval() - output_logits = model(input_ids, attention_mask=input_mask)["logits"] - self.parent.assertTrue(output_logits.shape[1] == input_ids.shape[-1]) - - def create_and_check_reformer_for_question_answering(self, config, input_ids, input_mask, choice_labels): - model = ReformerForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - start_positions=choice_labels, - end_positions=choice_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_past_buckets_states(self, config, input_ids, input_mask, choice_labels): - config.is_decoder = True - config.lsh_num_chunks_before = 1 - config.lsh_num_chunks_after = 0 - model = ReformerModelWithLMHead(config=config) - model.eval() - input_ids_first = input_ids[:, :-1] - input_ids_second = input_ids[:, -1:] - - # return saved cache - past_buckets_states = model(input_ids_first, use_cache=True)["past_buckets_states"] - - # calculate last output with and without cache - outputs_with_cache = model(input_ids_second, past_buckets_states=past_buckets_states, use_cache=True)["logits"] - outputs_without_cache = model(input_ids)["logits"][:, -1] - - # select random slice idx - random_slice_idx = ops.randint(0, outputs_without_cache.shape[-1], (1, 1)).item() - - # outputs should be similar within range - self.parent.assertTrue( - ops.allclose( - outputs_with_cache[:, 0, random_slice_idx], outputs_without_cache[:, random_slice_idx], atol=1e-2 - ) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, choice_labels) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - def create_and_check_reformer_for_sequence_classification( - self, config, input_ids, input_mask, choice_labels, is_decoder - ): - config.is_decoder = is_decoder - sequence_labels = ids_tensor([self.batch_size], config.num_labels) - model = ReformerForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - -class ReformerTesterMixin: - """ - Reformer Local and Reformer LSH run essentially the same tests - """ - - def test_config(self): - self.config_tester.run_common_tests() - - def test_reformer_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_model(*config_and_inputs) - - def test_reformer_lm_model_backward(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_model_with_lm_backward(*config_and_inputs) - - def test_reformer_model_attn_masking(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, is_decoder=True) - self.model_tester.create_and_check_reformer_model_with_attn_mask(*config_and_inputs, is_decoder=False) - - def test_reformer_with_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_with_lm(*config_and_inputs) - - def test_reformer_with_mlm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_with_mlm(*config_and_inputs) - - def test_reformer_layer_training_dropout(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=True) - self.model_tester.create_and_check_reformer_layer_dropout_seed(*config_and_inputs, is_decoder=False) - - @unittest.skip - def test_reformer_chunking_backward_equality(self): - if not self.model_tester.is_training: - self.skipTest(reason="model_tester.is_training is set to False") - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_feed_backward_chunking(*config_and_inputs) - - def test_reformer_no_chunking(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_no_chunking(*config_and_inputs) - - def test_reformer_qa_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_for_question_answering(*config_and_inputs) - - def test_reformer_cached_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_past_buckets_states(*config_and_inputs) - - def test_reformer_cached_generate(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_model_generate(*config_and_inputs) - - @slow - def test_dropout_random_seed_is_changing(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_random_seed(*config_and_inputs) - - def test_reformer_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_model_fp16_forward(*config_and_inputs) - - def test_reformer_model_fp16_generate(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs) - - - def test_for_sequence_classification(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_reformer_for_sequence_classification(*config_and_inputs, is_decoder=False) - - @unittest.skip(reason="Reformer cannot keep gradients in attentions or hidden states") - def test_retain_grad_hidden_states_attentions(self): - return - - @unittest.skip(reason="Reformer cannot resize embeddings that easily") - def test_resize_embeddings_untied(self): - return - - -@require_mindspore -class ReformerLocalAttnModelTest(ReformerTesterMixin, GenerationTesterMixin, ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (ReformerModel, ReformerModelWithLMHead, ReformerForSequenceClassification, ReformerForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (ReformerModelWithLMHead,) if is_mindspore_available() else () - test_pruning = False - test_headmasking = False - test_torchscript = False - test_sequence_classification_problem_types = True - - def setUp(self): - self.model_tester = ReformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37) - - @slow - def test_model_from_pretrained(self): - model_name = "google/reformer-crime-and-punishment" - model = ReformerModelWithLMHead.from_pretrained(model_name) - self.assertIsNotNone(model) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, list) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - - for idx, iter_attentions in enumerate(attentions): - tgt_len = min_length + idx if not use_cache else 1 - num_chunks = tgt_len // config.local_attn_chunk_length + (tgt_len % config.local_attn_chunk_length != 0) - tgt_chunk_len = config.local_attn_chunk_length - src_chunk_len = config.local_attn_chunk_length * ( - 1 + config.local_num_chunks_after + config.local_num_chunks_before - ) - - if use_cache: - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - min_length // config.local_attn_chunk_length + 1 + idx, - ) - else: - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - num_chunks, - tgt_chunk_len, - src_chunk_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) - ) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, list) for iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - - for idx, iter_hidden_states in enumerate(hidden_states): - seq_len = min_length + idx - seq_len = config.local_attn_chunk_length * ( - seq_len // config.local_attn_chunk_length + (seq_len % config.local_attn_chunk_length != 0) - ) - - if use_cache: - seq_len = 1 - - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - # check hidden size - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], - [expected_shape] * len(iter_hidden_states), - ) - - 
@unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass - - def _get_input_ids_and_config(self, batch_size=2): - # override because otherwise we hit the max possible seq length for the model (4*8=32) - # decreasing the seq_length in tester causes errors for "training_tests", those need exactly max seq length - # NOTE: seq_length has to be multiple of 4, otherwise it fails for other tests - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.pop(self.input_name) - _ = inputs_dict.pop("attention_mask", None) - _ = inputs_dict.pop("decoder_input_ids", None) - _ = inputs_dict.pop("decoder_attention_mask", None) - input_ids = input_ids[:batch_size, :16] - attention_mask = ops.ones_like(input_ids, dtype=mindspore.int64)[:batch_size, :16] - config.eos_token_id = None - config.forced_eos_token_id = None - return config, input_ids, attention_mask, inputs_dict - -@require_mindspore -class ReformerLSHAttnModelTest( - ReformerTesterMixin, ModelTesterMixin, GenerationTesterMixin, unittest.TestCase -): - all_model_classes = ( - (ReformerModel, ReformerModelWithLMHead, ReformerForSequenceClassification, ReformerForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (ReformerModelWithLMHead,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": ReformerModel, - "fill-mask": ReformerForMaskedLM, - "question-answering": ReformerForQuestionAnswering, - "text-classification": ReformerForSequenceClassification, - "text-generation": ReformerModelWithLMHead, - "zero-shot": ReformerForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizers are used. 
- # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - def setUp(self): - self.model_tester = ReformerModelTester( - self, - batch_size=13, - seq_length=13, - use_input_mask=True, - use_labels=True, - is_training=False, - is_decoder=True, - vocab_size=32, - attention_head_size=16, - hidden_size=64, - num_attention_heads=2, - num_buckets=2, - num_hashes=4, - lsh_attn_chunk_length=4, - lsh_num_chunks_before=1, - lsh_num_chunks_after=0, - chunk_size_lm_head=5, - chunk_size_feed_forward=6, - feed_forward_size=32, - hidden_act="relu", - hidden_dropout_prob=0.1, - lsh_attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - axial_norm_std=1.0, - layer_norm_eps=1e-12, - axial_pos_embds=True, - axial_pos_shape=[4, 8], - axial_pos_embds_dim=[16, 48], - # sanotheu - # attn_layers=[lsh,lsh,lsh,lsh], - attn_layers=["lsh"], - pad_token_id=0, - eos_token_id=2, - scope=None, - hash_seed=0, - num_labels=2, - ) - self.config_tester = ConfigTester(self, config_class=ReformerConfig, hidden_size=37) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, list) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - - for idx, iter_attentions in enumerate(attentions): - tgt_len = min_length + idx if not use_cache else 1 - num_chunks = tgt_len // config.lsh_attn_chunk_length + (tgt_len % config.lsh_attn_chunk_length != 0) - tgt_chunk_len = config.lsh_attn_chunk_length - src_chunk_len = config.lsh_attn_chunk_length * ( - 1 + config.lsh_num_chunks_after + config.lsh_num_chunks_before - ) - - if use_cache: - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - config.num_hashes, - tgt_len, - config.num_hashes * (1 + config.lsh_num_chunks_after + config.lsh_num_chunks_before), - ) - else: - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - num_chunks * config.num_hashes, - tgt_chunk_len, - src_chunk_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) - ) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, list) for iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - - for idx, iter_hidden_states in enumerate(hidden_states): - seq_len = min_length + idx if not use_cache else 1 - seq_len = config.lsh_attn_chunk_length * ( - seq_len // config.lsh_attn_chunk_length + (seq_len % config.lsh_attn_chunk_length != 0) - ) - - if use_cache: - seq_len = 1 - - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - # check hidden size - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], - [expected_shape] * len(iter_hidden_states), - ) - - @unittest.skip(reason="Fails because the sequence length is not a multiple of 4") - def 
test_problem_types(self): - pass - - @unittest.skip(reason="Fails because the sequence length is not a multiple of 4") - def test_past_key_values_format(self): - pass - - @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class ReformerIntegrationTests(unittest.TestCase): - """ - These integration tests test the current layer activations and gradients againts the output of the Hugging Face Reformer model at time of integration: 29/06/2020. During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "lsh" only, masked / non-masked, different chunk length, ....). In order to recover the original trax integration tests, one should use patrickvonplaten's fork of trax and the code that lives on the branch `reformer_trax_tests`. - """ - - def _get_basic_config_and_input(self): - config = { - "vocab_size": 320, - "attention_head_size": 8, - "hidden_size": 16, - "num_attention_heads": 2, - "num_buckets": 2, - "num_hashes": 4, - "lsh_attn_chunk_length": 4, - "local_attn_chunk_length": 4, - "lsh_num_chunks_before": 1, - "lsh_num_chunks_after": 0, - "local_num_chunks_before": 1, - "local_num_chunks_after": 0, - "chunk_size_lm_head": 0, - "chunk_size_feed_forward": 0, - "feed_forward_size": 32, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.0, - "lsh_attention_probs_dropout_prob": 0.0, - "local_attention_probs_dropout_prob": 0.0, - "max_position_embeddings": 32, - "initializer_range": 0.02, - "axial_norm_std": 1.0, - "layer_norm_eps": 1e-12, - "sinusoidal_pos_embds": False, - "axial_pos_embds": True, - "axial_pos_shape": [4, 8], - "axial_pos_embds_dim": [8, 8], - "hash_seed": 0, - "is_decoder": True, - } - return config - - def _get_hidden_states(self): - return mindspore.tensor( - [ - [ - [ - 1.90826353e00, - -1.45999730e00, - -6.20405462e-01, - 1.52503433e00, - -3.64464232e-01, - -8.27359235e-01, - 8.39670803e-01, - 2.44492178e-01, - 4.98332758e-01, - 2.69175139e00, - -7.08081422e-03, - 1.04915401e00, - -1.83476661e00, - 7.67220476e-01, - 2.98580543e-01, - 2.84803992e-02, - ], - [ - -2.66374286e-02, - 4.33497576e-01, - 3.10386309e-01, - 5.46039944e-01, - -2.47292666e-04, - -7.52305019e-01, - 2.39162103e-01, - 7.25216186e-01, - -7.58357372e-01, - 4.20635998e-01, - -4.04739919e-02, - 1.59924145e-01, - 2.05135748e00, - -1.15997978e00, - 5.37166397e-01, - 2.62873606e-01, - ], - [ - 1.85247482e-01, - 7.07046037e-01, - -6.77089715e-01, - -2.24209655e00, - -3.75307980e-02, - -8.59380874e-01, - -2.81027884e00, - 1.01276376e00, - -1.69438001e00, - 4.17574660e-01, - -1.49196962e00, - -1.76483717e00, - -1.94566312e-01, - -1.71183858e00, - 7.72903565e-01, - -1.11557056e00, - ], - [ - 9.46069193e-01, - 1.53417623e-01, - -9.58686996e-01, - 1.18126669e-01, - 1.75967724e00, - 1.62194590e00, - -5.74108159e-01, - 6.79920443e-01, - 5.44028163e-01, - 2.05466114e-01, - -3.63045868e-01, - 2.41865062e-01, - 3.20348382e-01, - -9.05611176e-01, - -1.92690727e-01, - -1.19917547e00, - ], - ] - ], - dtype=mindspore.float32, - ) - - def _get_attn_mask(self): - return mindspore.tensor([[0, 1, 0, 0]], dtype=mindspore.int64) - - def _get_input_ids_and_mask(self): - mask = mindspore.tensor( - [ - [1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 
0, 1, 0], - ], - dtype=mindspore.int64, - ) - - input_ids = mindspore.tensor( - [ - [ - 89, - 279, - 286, - 84, - 194, - 316, - 182, - 28, - 283, - 37, - 169, - 7, - 253, - 267, - 107, - 250, - 44, - 7, - 102, - 62, - 3, - 243, - 171, - 265, - 302, - 48, - 164, - 264, - 148, - 229, - 280, - 150, - ], - [ - 9, - 192, - 66, - 112, - 163, - 83, - 135, - 70, - 224, - 96, - 31, - 80, - 196, - 80, - 63, - 22, - 85, - 100, - 47, - 283, - 0, - 163, - 126, - 143, - 195, - 82, - 53, - 82, - 18, - 27, - 182, - 52, - ], - ], - dtype=mindspore.int64, - ) - - return input_ids, mask - - @unittest.skip - def test_lsh_layer_forward(self): - config = self._get_basic_config_and_input() - config["lsh_num_chunks_before"] = 0 - config["attn_layers"] = ["lsh"] - config["is_decoder"] = False - hidden_states = self._get_hidden_states() - mindspore.manual_seed(0) - mindspore.set_seed(0) - layer = ReformerLayer(ReformerConfig(**config)) - layer.eval() - reformer_output = layer(prev_attn_output=hidden_states.copy(), hidden_states=hidden_states) - output_slice = reformer_output.hidden_states[0, 0, :5] - expected_output_slice = mindspore.tensor( - [1.6879, -1.3083, -0.4708, 1.3555, -0.6292], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(output_slice, expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_lsh_layer_forward_complex(self): - config = self._get_basic_config_and_input() - config["lsh_num_chunks_before"] = 0 - config["attn_layers"] = ["lsh"] - config["num_buckets"] = [2, 4] - attn_mask = self._get_attn_mask() - hidden_states = self._get_hidden_states() - mindspore.manual_seed(0) - mindspore.set_seed(0) - layer = ReformerLayer(ReformerConfig(**config)) - layer.eval() - reformer_output = layer( - prev_attn_output=hidden_states.copy(), - hidden_states=hidden_states, - attention_mask=attn_mask, - ) - output_slice = reformer_output.hidden_states[0, 0, :5] - expected_output_slice = mindspore.tensor( - [1.6439, -1.2306, -0.5108, 1.3006, -0.6537], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(output_slice, expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_local_layer_forward(self): - config = self._get_basic_config_and_input() - config["local_num_chunks_before"] = 0 - config["attn_layers"] = ["local"] - config["is_decoder"] = False - hidden_states = self._get_hidden_states() - mindspore.manual_seed(0) - mindspore.set_seed(0) - layer = ReformerLayer(ReformerConfig(**config)) - layer.eval() - reformer_output = layer(prev_attn_output=hidden_states, hidden_states=hidden_states) - output_slice = reformer_output.hidden_states[0, 0, :5] - expected_output_slice = mindspore.tensor( - [1.4212, -2.0576, -0.9688, 1.4599, -0.1344], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(output_slice, expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_local_layer_forward_complex(self): - config = self._get_basic_config_and_input() - config["local_num_chunks_before"] = 0 - config["attn_layers"] = ["local"] - attn_mask = self._get_attn_mask() - hidden_states = self._get_hidden_states() - mindspore.manual_seed(0) - mindspore.set_seed(0) - layer = ReformerLayer(ReformerConfig(**config)) - layer.eval() - reformer_output = layer( - prev_attn_output=hidden_states, - hidden_states=hidden_states, - attention_mask=attn_mask, - ) - output_slice = reformer_output.hidden_states[0, 0, :5] - expected_output_slice = mindspore.tensor( - [1.4750, -2.0235, -0.9743, 1.4463, -0.1269], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(output_slice, 
expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_lsh_model_forward(self): - config = self._get_basic_config_and_input() - config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"] - config["num_buckets"] = [2, 4] - mindspore.manual_seed(123) - mindspore.set_seed(123) - model = ReformerModel(ReformerConfig(**config)) - model.eval() - input_ids, attn_mask = self._get_input_ids_and_mask() - hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0] - output_slice = hidden_states[0, 0, :5] - expected_output_slice = mindspore.tensor( - [-0.9896, -0.9396, -1.0831, -0.0597, 0.2456], - dtype=mindspore.float32, - ) - print(output_slice) - self.assertTrue(ops.allclose(output_slice, expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_local_model_forward(self): - config = self._get_basic_config_and_input() - config["attn_layers"] = ["local", "local", "local", "local"] - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = ReformerModel(ReformerConfig(**config)) - model.eval() - input_ids, attn_mask = self._get_input_ids_and_mask() - hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0] - output_slice = hidden_states[0, 0, :5] - expected_output_slice = mindspore.tensor( - [-1.6791, 0.7171, 0.1594, 0.4063, 1.2584], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(output_slice, expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_lm_model_forward(self): - config = self._get_basic_config_and_input() - config["attn_layers"] = ["local", "lsh", "local", "lsh", "local", "lsh"] - config["num_buckets"] = [2, 4] - config["is_decoder"] = False - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = ReformerForMaskedLM(ReformerConfig(**config)) - model.eval() - input_ids, attn_mask = self._get_input_ids_and_mask() - hidden_states = model(input_ids=input_ids, attention_mask=attn_mask)[0] - output_slice = hidden_states[1, -1, :5] - expected_output_slice = mindspore.tensor( - [0.1018, -0.2026, 0.2116, 0.0270, -0.1233], - dtype=mindspore.float32, - ) - - self.assertTrue(ops.allclose(output_slice, expected_output_slice, atol=1e-3)) - - @unittest.skip - def test_local_lm_model_grad(self): - config = self._get_basic_config_and_input() - config["attn_layers"] = ["local", "local", "local", "local"] - config["hidden_dropout_prob"] = 0.0 - config["local_attention_probs_dropout_prob"] = 0.0 - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = ReformerModelWithLMHead(ReformerConfig(**config)) - model.train() - model.zero_grad() - input_ids, _ = self._get_input_ids_and_mask() - loss = model(input_ids=input_ids, labels=input_ids)[0] - - self.assertTrue(ops.allclose(loss, mindspore.tensor(5.8019, dtype=mindspore.float32), atol=1e-3)) - loss.backward() - - # check last grads to cover all proable errors - grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] - expected_grad_slice_word = mindspore.tensor( - [-0.0005, -0.0001, -0.0002, -0.0006, -0.0006], - dtype=mindspore.float32, - ) - grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] - expected_grad_slice_pos_fac_1 = mindspore.tensor( - [-0.5235, 0.5704, 0.0922, -0.3140, 0.9928], - dtype=mindspore.float32, - ) - grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] - expected_grad_slice_pos_fac_2 = mindspore.tensor( - [1.7960, 1.7668, 0.5593, 0.0907, 1.8342], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(grad_slice_word, expected_grad_slice_word, 
atol=1e-3)) - self.assertTrue(ops.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3)) - self.assertTrue(ops.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3)) - - @unittest.skip - def test_lsh_lm_model_grad(self): - config = self._get_basic_config_and_input() - config["attn_layers"] = ["lsh", "lsh", "lsh", "lsh"] - config["hidden_dropout_prob"] = 0.0 - config["lsh_attention_probs_dropout_prob"] = 0.0 - config["num_buckets"] = [2, 4] - config["num_hashes"] = 6 - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = ReformerModelWithLMHead(ReformerConfig(**config)) - model.train() - model.zero_grad() - input_ids, _ = self._get_input_ids_and_mask() - loss = model(input_ids=input_ids, labels=input_ids)[0] - - self.assertTrue(ops.allclose(loss, mindspore.tensor(5.7854, dtype=mindspore.float32), atol=1e-3)) - loss.backward() - # check last grads to cover all proable errors - grad_slice_word = model.reformer.embeddings.word_embeddings.weight.grad[0, :5] - expected_grad_slice_word = mindspore.tensor( - [0.0004, 0.0003, 0.0006, -0.0004, 0.0002], - dtype=mindspore.float32, - ) - grad_slice_position_factor_1 = model.reformer.embeddings.position_embeddings.weights[0][1, 0, -5:] - expected_grad_slice_pos_fac_1 = mindspore.tensor( - [-0.3792, 0.5593, -1.6993, 0.2033, 0.4131], - dtype=mindspore.float32, - ) - grad_slice_position_factor_2 = model.reformer.embeddings.position_embeddings.weights[1][0, 1, :5] - expected_grad_slice_pos_fac_2 = mindspore.tensor( - [-1.4212, -0.3201, -1.1944, 0.1258, 0.2856], - dtype=mindspore.float32, - ) - self.assertTrue(ops.allclose(grad_slice_word, expected_grad_slice_word, atol=1e-3)) - self.assertTrue(ops.allclose(grad_slice_position_factor_1, expected_grad_slice_pos_fac_1, atol=1e-3)) - self.assertTrue(ops.allclose(grad_slice_position_factor_2, expected_grad_slice_pos_fac_2, atol=1e-3)) - - @slow - def test_pretrained_generate_crime_and_punish(self): - model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment") - tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment") - model.eval() - - input_ids = tokenizer.encode("A few months later", return_tensors="ms") - output_ids = model.generate( - input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8 - ) - output = tokenizer.decode(output_ids[0]) - - self.assertEqual( - output, - "A few months later state expression in his ideas, at the first entrance. 
He was positively for an inst", - ) - - @slow - def test_pretrained_generate_use_cache_equality(self): - model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment") - tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment") - model.eval() - input_ids = tokenizer.encode("A few months later", return_tensors="ms") - output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False) - output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True) - - output_with_cache = tokenizer.decode(output_ids_with_cache[0]) - output_without_cache = tokenizer.decode(output_ids_without_cache[0]) - - self.assertEqual(output_with_cache, output_without_cache) \ No newline at end of file diff --git a/tests/transformers/models/rembert/__init__.py b/tests/transformers/models/rembert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/rembert/test_modeling_rembert.py b/tests/transformers/models/rembert/test_modeling_rembert.py deleted file mode 100644 index 9fd93203b..000000000 --- a/tests/transformers/models/rembert/test_modeling_rembert.py +++ /dev/null @@ -1,501 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore RemBERT model.""" - -import unittest -import numpy as np -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - RemBertConfig, - RemBertForCausalLM, - RemBertForMaskedLM, - RemBertForMultipleChoice, - RemBertForQuestionAnswering, - RemBertForSequenceClassification, - RemBertForTokenClassification, - RemBertModel, - ) - - -class RemBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - input_embedding_size=18, - output_embedding_size=43, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.input_embedding_size = input_embedding_size - self.output_embedding_size = output_embedding_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = RemBertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - input_embedding_size=self.input_embedding_size, - output_embedding_size=self.output_embedding_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - 
max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RemBertModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = RemBertModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = RemBertForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RemBertForMaskedLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = RemBertForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - 
encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RemBertForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = RemBertForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = RemBertForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = RemBertForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = 
input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class RemBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - RemBertModel, - RemBertForMaskedLM, - RemBertForCausalLM, - RemBertForMultipleChoice, - RemBertForQuestionAnswering, - RemBertForSequenceClassification, - RemBertForTokenClassification, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (RemBertForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": RemBertModel, - "fill-mask": RemBertForMaskedLM, - "question-answering": RemBertForQuestionAnswering, - "text-classification": RemBertForSequenceClassification, - "text-generation": RemBertForCausalLM, - "token-classification": RemBertForTokenClassification, - "zero-shot": RemBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = RemBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=RemBertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - 
self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - @slow - def test_model_from_pretrained(self): - model_name = "google/rembert" - model = RemBertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class RemBertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_model(self): - # Test exact values at the last hidden layer - model = RemBertModel.from_pretrained("google/rembert") - input_ids = mindspore.tensor([[312, 56498, 313, 2125, 313]]) - segment_ids = mindspore.tensor([[0, 0, 0, 1, 1]]) - with mindspore._no_grad(): - output = model(input_ids, token_type_ids=segment_ids, output_hidden_states=True) - - hidden_size = 1152 - - expected_shape = (1, 5, hidden_size) - self.assertEqual(output["last_hidden_state"].shape, expected_shape) - - expected_implementation = mindspore.tensor( - [ - [ - [0.0754, -0.2022, 0.1904], - [-0.3354, -0.3692, -0.4791], - [-0.2314, -0.6729, -0.0749], - [-0.0396, -0.3105, -0.4234], - [-0.1571, -0.0525, 0.5353], - ] - ] - ) - - # Running on the original tf implementation gives slightly different results here. - # Not clear why this variations is present - # TODO: Find reason for discrepancy - # expected_original_implementation = [[ - # [0.07630594074726105, -0.20146065950393677, 0.19107051193714142], - # [-0.3405614495277405, -0.36971670389175415, -0.4808273911476135], - # [-0.22587086260318756, -0.6656315922737122, -0.07844287157058716], - # [-0.04145475849509239, -0.3077218234539032, -0.42316967248916626], - # [-0.15887849032878876, -0.054529931396245956, 0.5356100797653198] - # ]] - - self.assertTrue(np.allclose(output["last_hidden_state"][:, :, :3].asnumpy(), expected_implementation.asnumpy(), atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/resnet/__init__.py b/tests/transformers/models/resnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/resnet/test_modeling_resnet.py b/tests/transformers/models/resnet/test_modeling_resnet.py deleted file mode 100644 index b65178a26..000000000 --- a/tests/transformers/models/resnet/test_modeling_resnet.py +++ /dev/null @@ -1,314 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore ResNet model. 
""" - - -import unittest -import numpy as np -from mindnlp.transformers import ResNetConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore import nn, ops - - from mindnlp.transformers import ResNetBackbone, ResNetForImageClassification, ResNetModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class ResNetModelTester: - def __init__( - self, - parent, - batch_size=3, - image_size=32, - num_channels=3, - embeddings_size=10, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 2, 1], - is_training=True, - use_labels=True, - hidden_act="relu", - num_labels=3, - scope=None, - out_features=["stage2", "stage3", "stage4"], - out_indices=[2, 3, 4], - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.embeddings_size = embeddings_size - self.hidden_sizes = hidden_sizes - self.depths = depths - self.is_training = is_training - self.use_labels = use_labels - self.hidden_act = hidden_act - self.num_labels = num_labels - self.scope = scope - self.num_stages = len(hidden_sizes) - self.out_features = out_features - self.out_indices = out_indices - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return ResNetConfig( - num_channels=self.num_channels, - embeddings_size=self.embeddings_size, - hidden_sizes=self.hidden_sizes, - depths=self.depths, - hidden_act=self.hidden_act, - num_labels=self.num_labels, - out_features=self.out_features, - out_indices=self.out_indices, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ResNetModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected last hidden states: B, C, H // 32, W // 32 - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.hidden_sizes[-1], self.image_size // 32, self.image_size // 32), - ) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = ResNetForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = ResNetBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[1], 4, 4]) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, config.hidden_sizes[1:]) - - # verify backbone works with 
out_features=None - config.out_features = None - model = ResNetBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, self.hidden_sizes[-1], 1, 1]) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - self.parent.assertListEqual(model.channels, [config.hidden_sizes[-1]]) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ResNetModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ResNet does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - ResNetModel, - ResNetForImageClassification, - ResNetBackbone, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": ResNetModel, "image-classification": ResNetForImageClassification} - if is_mindspore_available() - else {} - ) - - fx_compatible = True - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = ResNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=ResNetConfig, has_text_modality=False) - - def test_config(self): - self.create_and_test_config_common_properties() - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def create_and_test_config_common_properties(self): - return - - @unittest.skip(reason="ResNet does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="ResNet does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, module in model.cells_and_names(): - if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): - self.assertTrue( - ops.all(module.weight == 1), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertTrue( - ops.all(module.bias == 0), - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - 
- expected_num_stages = self.model_tester.num_stages - self.assertEqual(len(hidden_states), expected_num_stages + 1) - - # ResNet's feature maps are of shape (batch_size, num_channels, height, width) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.image_size // 4, self.model_tester.image_size // 4], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - layers_type = ["basic", "bottleneck"] - for model_class in self.all_model_classes: - for layer_type in layers_type: - config.layer_type = layer_type - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip(reason="ResNet does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/resnet-50" - model = ResNetModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ResNetModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("microsoft/resnet-50") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-11.1069, -9.7877, -8.3777]) - print(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - -@require_mindspore -class ResNetBackboneTest(BackboneTesterMixin, unittest.TestCase): - all_model_classes = (ResNetBackbone,) if is_mindspore_available() else () - has_attentions = False - config_class = ResNetConfig - - def setUp(self): - self.model_tester = ResNetModelTester(self) \ No newline at end of file diff --git a/tests/transformers/models/roberta/__init__.py b/tests/transformers/models/roberta/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/roberta/test_modeling_roberta.py b/tests/transformers/models/roberta/test_modeling_roberta.py deleted file mode 100644 index 4c299b8d6..000000000 --- a/tests/transformers/models/roberta/test_modeling_roberta.py +++ /dev/null @@ -1,570 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# Copyright 2022 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Test Roberta""" -import pytest -import numpy as np - -from mindnlp.transformers import RobertaConfig -from mindnlp.utils import is_mindspore_available, require_mindspore -from mindnlp.utils.testing_utils import TestCasePlus, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - RobertaForCausalLM, - RobertaForMaskedLM, - RobertaForMultipleChoice, - RobertaForQuestionAnswering, - RobertaForSequenceClassification, - RobertaForTokenClassification, - RobertaModel, - ) - from mindnlp.transformers.models.roberta.modeling_roberta import ( - ROBERTA_SUPPORT_LIST, - RobertaEmbeddings, - create_position_ids_from_input_ids, - ) - -ROBERTA_TINY = "sshleifer/tiny-distilroberta-base" - -class RobertaModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = 
ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return RobertaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RobertaModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = RobertaModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = RobertaForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, 
- token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = RobertaForCausalLM(config=config).set_train(False) - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), - atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RobertaForMaskedLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = RobertaForTokenClassification(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = RobertaForMultipleChoice(config=config) - - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, 
self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RobertaForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin): - all_model_classes = ( - ( - RobertaForCausalLM, - RobertaForMaskedLM, - RobertaForSequenceClassification, - RobertaForTokenClassification, - RobertaForMultipleChoice, - RobertaForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (RobertaForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": RobertaModel, - "fill-mask": RobertaForMaskedLM, - "question-answering": RobertaForQuestionAnswering, - "text-classification": RobertaForSequenceClassification, - "text-generation": RobertaForCausalLM, - "token-classification": RobertaForTokenClassification, - "zero-shot": RobertaForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = RobertaModelTester(self) - self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - 
encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - - @slow - def test_model_from_pretrained(self): - for model_name in ROBERTA_SUPPORT_LIST[:1]: - model = RobertaModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is RobertaEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = RobertaEmbeddings(config=config) - - input_ids = mindspore.Tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = mindspore.Tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is RobertaEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = RobertaEmbeddings(config=config) - - inputs_embeds = ops.randn(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = mindspore.Tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - -@require_mindspore -class RobertaModelIntegrationTest(TestCasePlus): - - @slow - def test_inference_masked_lm(self): - model = RobertaForMaskedLM.from_pretrained("roberta-base") - - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = (1, 11, 50265) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]] - ) - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - - @slow - def test_inference_no_head(self): - model = RobertaModel.from_pretrained("roberta-base") - - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = mindspore.tensor( - [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]] - ) - - self.assertTrue(np.allclose(output[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - - @slow - def test_inference_classification_head(self): - model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli") - - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = (1, 3) - self.assertEqual(output.shape, expected_shape) - expected_tensor = mindspore.tensor([[-0.9469, 0.3913, 0.5118]]) - - self.assertTrue(np.allclose(output.asnumpy(), expected_tensor.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/roberta_prelayernorm/__init__.py b/tests/transformers/models/roberta_prelayernorm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/transformers/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py deleted file mode 100644 index eb591e95a..000000000 --- a/tests/transformers/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ /dev/null @@ -1,560 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
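The removed test_create_position_ids_* tests document Roberta's padding-aware position ids: non-padding tokens receive sequential ids starting at padding_idx + 1, while padding positions keep padding_idx itself. The following is a plain NumPy sketch of that behaviour exactly as the deleted tests assert it, not the library's own implementation.

# Sketch of the padding-aware position-id scheme checked by the removed tests.
import numpy as np

def create_position_ids(input_ids: np.ndarray, padding_idx: int) -> np.ndarray:
    # 1 where the token is real, 0 where it is padding.
    mask = (input_ids != padding_idx).astype(np.int64)
    # Cumulative count of real tokens gives 1, 2, 3, ... per non-padding position;
    # multiplying by the mask zeroes out padding positions again.
    incremental_indices = np.cumsum(mask, axis=1) * mask
    # Shift by padding_idx so real tokens start at padding_idx + 1 and padding stays at padding_idx.
    return incremental_indices + padding_idx

padding_idx = 1
input_ids = np.array([[12, 31, 13, padding_idx]])
print(create_position_ids(input_ids, padding_idx))
# [[2 3 4 1]] -> first real token gets padding_idx + 1, the padding slot keeps padding_idx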
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.transformers import RobertaPreLayerNormConfig -from mindnlp.utils.testing_utils import is_mindspore_available, require_mindspore, slow, TestCasePlus - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - RobertaPreLayerNormForCausalLM, - RobertaPreLayerNormForMaskedLM, - RobertaPreLayerNormForMultipleChoice, - RobertaPreLayerNormForQuestionAnswering, - RobertaPreLayerNormForSequenceClassification, - RobertaPreLayerNormForTokenClassification, - RobertaPreLayerNormModel, - ) - from mindnlp.transformers.models.roberta_prelayernorm.modeling_roberta_prelayernorm import ( - RobertaPreLayerNormEmbeddings, - create_position_ids_from_input_ids, - ) - - -# Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTester with Roberta->RobertaPreLayerNorm -class RobertaPreLayerNormModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return RobertaPreLayerNormConfig( - vocab_size=self.vocab_size, - 
hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RobertaPreLayerNormModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = RobertaPreLayerNormModel(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = RobertaPreLayerNormForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = RobertaPreLayerNormForCausalLM(config=config).eval() - - # make sure that ids don't start with pad 
token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RobertaPreLayerNormForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = RobertaPreLayerNormForTokenClassification(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = RobertaPreLayerNormForMultipleChoice(config=config) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def 
create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = RobertaPreLayerNormForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class RobertaPreLayerNormModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - RobertaPreLayerNormForCausalLM, - RobertaPreLayerNormForMaskedLM, - RobertaPreLayerNormForSequenceClassification, - RobertaPreLayerNormForTokenClassification, - RobertaPreLayerNormForMultipleChoice, - RobertaPreLayerNormForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (RobertaPreLayerNormForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": RobertaPreLayerNormModel, - "fill-mask": RobertaPreLayerNormForMaskedLM, - "question-answering": RobertaPreLayerNormForQuestionAnswering, - "text-classification": RobertaPreLayerNormForSequenceClassification, - "text-generation": RobertaPreLayerNormForCausalLM, - "token-classification": RobertaPreLayerNormForTokenClassification, - "zero-shot": RobertaPreLayerNormForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - model_split_percents = [0.5, 0.8, 0.9] - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.setUp with Roberta->RobertaPreLayerNorm - def setUp(self): - self.model_tester = RobertaPreLayerNormModelTester(self) - self.config_tester = ConfigTester(self, config_class=RobertaPreLayerNormConfig, hidden_size=37) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_config - def test_config(self): - self.config_tester.run_common_tests() - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model_various_embeddings - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model_as_decoder - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - # Copied from 
tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model_as_decoder_with_default_input_mask - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_for_causal_lm - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_decoder_model_past_with_large_inputs - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_for_masked_lm - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_for_token_classification - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_for_multiple_choice - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_for_question_answering - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "andreasmadsen/efficient_mlm_m0.15" - model = RobertaPreLayerNormModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_create_position_ids_respects_padding_index with Roberta->RobertaPreLayerNorm - def test_create_position_ids_respects_padding_index(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is RobertaPreLayerNormEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = RobertaPreLayerNormEmbeddings(config=config) - - input_ids = mindspore.Tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = mindspore.Tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_create_position_ids_from_inputs_embeds with Roberta->RobertaPreLayerNorm - def test_create_position_ids_from_inputs_embeds(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is RobertaPreLayerNormEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = RobertaPreLayerNormEmbeddings(config=config) - - inputs_embeds = ops.empty(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = mindspore.Tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - -@require_mindspore -class RobertaPreLayerNormModelIntegrationTest(TestCasePlus): - @slow - def test_inference_masked_lm(self): - model = RobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40") - - input_ids = mindspore.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - expected_shape = (1, 11, 50265) - self.assertEqual(output.shape, expected_shape) - # compare the actual values for a slice. - # newer pytorch and huggingface use replace not assign where param load, - # therefore the result is different. - # EXPECTED_SLICE = mindspore.Tensor( - # [[[40.4880, 18.0199, -5.2367], [-1.8877, -4.0885, 10.7085], [-2.2613, -5.6110, 7.2665]]] - # ) - EXPECTED_SLICE = mindspore.Tensor( - [[[43.7966, 18.3873, -5.4877], - [ 1.4209, -3.7211, 10.4575], - [ 1.0473, -5.2436, 7.0155]]] - ) - print("output[:, :3, :3]***************", output[:, :3, :3]) - - self.assertTrue(ops.allclose(output[:, :3, :3], EXPECTED_SLICE, atol=1e-4)) - - @slow - def test_inference_no_head(self): - model = RobertaPreLayerNormModel.from_pretrained("andreasmadsen/efficient_mlm_m0.40") - - input_ids = mindspore.Tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - # compare the actual values for a slice. 
- EXPECTED_SLICE = mindspore.Tensor( - [[[0.0208, -0.0356, 0.0237], [-0.1569, -0.0411, -0.2626], [0.1879, 0.0125, -0.0089]]] - ) - - self.assertTrue(ops.allclose(output[:, :3, :3], EXPECTED_SLICE, atol=1e-4)) diff --git a/tests/transformers/models/roc_bert/__init__.py b/tests/transformers/models/roc_bert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/roc_bert/test_modeling_roc_bert.py b/tests/transformers/models/roc_bert/test_modeling_roc_bert.py deleted file mode 100644 index 1caf117c4..000000000 --- a/tests/transformers/models/roc_bert/test_modeling_roc_bert.py +++ /dev/null @@ -1,828 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore RoCBert model.""" - -import unittest - -import numpy as np -from mindspore import ops - -from mindnlp.transformers import RoCBertConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, - random_attention_mask, -) - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - MODEL_FOR_PRETRAINING_MAPPING, - RoCBertForCausalLM, - RoCBertForMaskedLM, - RoCBertForMultipleChoice, - RoCBertForPreTraining, - RoCBertForQuestionAnswering, - RoCBertForSequenceClassification, - RoCBertForTokenClassification, - RoCBertModel, - ) - - -class RoCBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - pronunciation_vocab_size=99, - shape_vocab_size=99, - pronunciation_embed_dim=32, - shape_embed_dim=32, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.pronunciation_vocab_size = pronunciation_vocab_size - self.shape_vocab_size = shape_vocab_size - self.pronunciation_embed_dim = pronunciation_embed_dim - self.shape_embed_dim = shape_embed_dim - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = 
attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_shape_ids = ids_tensor( - [self.batch_size, self.seq_length], self.shape_vocab_size - ) - input_pronunciation_ids = ids_tensor( - [self.batch_size, self.seq_length], self.pronunciation_vocab_size - ) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return RoCBertConfig( - vocab_size=self.vocab_size, - shape_vocab_size=self.shape_vocab_size, - pronunciation_vocab_size=self.pronunciation_vocab_size, - shape_embed_dim=self.shape_embed_dim, - pronunciation_embed_dim=self.pronunciation_embed_dim, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size] - ) - encoder_attention_mask = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2 - ) - - return ( - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = RoCBertModel(config=config) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - token_type_ids=token_type_ids, - ) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - 
input_pronunciation_ids=input_pronunciation_ids, - ) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = RoCBertModel(config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = RoCBertForCausalLM(config=config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = RoCBertForMaskedLM(config=config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = RoCBertForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_shape_tokens = ids_tensor((self.batch_size, 3), config.shape_vocab_size) - 
next_pronunciation_tokens = ids_tensor( - (self.batch_size, 3), config.pronunciation_vocab_size - ) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_input_shape_ids = ops.cat([input_shape_ids, next_shape_tokens], axis=-1) - next_input_pronunciation_ids = ops.cat( - [input_pronunciation_ids, next_pronunciation_tokens], axis=-1 - ) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - input_shape_ids=next_input_shape_ids, - input_pronunciation_ids=next_input_pronunciation_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - input_shape_ids=next_shape_tokens, - input_pronunciation_ids=next_pronunciation_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue( - np.allclose( - output_from_past_slice.asnumpy(), - output_from_no_past_slice.asnumpy(), - atol=1e-3, - ) - ) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = RoCBertForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = RoCBertForSequenceClassification(config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=sequence_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = RoCBertForTokenClassification(config=config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids=input_shape_ids, - input_pronunciation_ids=input_pronunciation_ids, - attention_mask=input_mask, - 
token_type_ids=token_type_ids, - labels=token_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.num_labels) - ) - - def create_and_check_for_multiple_choice( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_choices = self.num_choices - model = RoCBertForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - multiple_choice_inputs_shape_ids = input_shape_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - multiple_choice_inputs_pronunciation_ids = input_pronunciation_ids.unsqueeze( - 1 - ).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to( - (-1, self.num_choices, -1) - ) - result = model( - multiple_choice_inputs_ids, - input_shape_ids=multiple_choice_inputs_shape_ids, - input_pronunciation_ids=multiple_choice_inputs_pronunciation_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "input_shape_ids": input_shape_ids, - "input_pronunciation_ids": input_pronunciation_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - def create_and_check_for_pretraining( - self, - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = RoCBertForPreTraining(config=config) - model.set_train(False) - result = model( - input_ids, - input_shape_ids, - input_pronunciation_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - attack_input_ids=input_ids, - attack_input_shape_ids=input_shape_ids, - attack_input_pronunciation_ids=input_pronunciation_ids, - attack_attention_mask=input_mask, - attack_token_type_ids=token_type_ids, - labels_input_ids=token_labels, - labels_input_shape_ids=input_shape_ids, - labels_input_pronunciation_ids=input_pronunciation_ids, - labels_attention_mask=input_mask, - labels_token_type_ids=token_type_ids, - ) - self.parent.assertEqual( - result.logits.shape, - (self.batch_size, self.seq_length, self.vocab_size), - ) - - -@require_mindspore -class RoCBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - # RoCBertModel, - RoCBertForMaskedLM, - RoCBertForCausalLM, - RoCBertForMultipleChoice, - RoCBertForQuestionAnswering, - RoCBertForSequenceClassification, - RoCBertForTokenClassification, - RoCBertForPreTraining, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (RoCBertForCausalLM,) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": RoCBertModel, - "fill-mask": RoCBertForMaskedLM, - "question-answering": RoCBertForQuestionAnswering, - 
"text-classification": RoCBertForSequenceClassification, - "text-generation": RoCBertForCausalLM, - "token-classification": RoCBertForTokenClassification, - "zero-shot": RoCBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, - pipeline_test_casse_name, - config_class, - model_architecture, - tokenizer_name, - processor_name, - ): - if pipeline_test_casse_name in [ - "FillMaskPipelineTests", - "FeatureExtractionPipelineTests", - "TextClassificationPipelineTests", - "TokenClassificationPipelineTests", - ]: - # Get error: IndexError: index out of range in self. - # `word_shape_file` and `word_pronunciation_file` should be shrunk during tiny model creation, - # otherwise `IndexError` could occur in some embedding layers. Skip for now until this model has - # more usage. - return True - - return False - - # special case for ForPreTraining model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class( - inputs_dict, model_class, return_labels=return_labels - ) - - if return_labels: - if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING): - inputs_dict["labels_input_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["labels_input_shape_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["labels_input_pronunciation_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["attack_input_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["attack_input_shape_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - inputs_dict["attack_input_pronunciation_ids"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), - dtype=mindspore.int64, - ) - return inputs_dict - - def setUp(self): - self.model_tester = RoCBertModelTester(self) - self.config_tester = ConfigTester( - self, config_class=RoCBertConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs - ) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs( - *config_and_inputs - ) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification( - *config_and_inputs - ) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - input_shape_ids, - input_pronunciation_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - @slow - def test_model_from_pretrained(self): - model_name = "weiweishi/roc-bert-base-zh" - model = RoCBertModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - -@require_mindspore -class RoCBertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_masked_lm(self): - model = RoCBertForMaskedLM.from_pretrained("weiweishi/roc-bert-base-zh") - model.set_train(False) - # input_text: ['[CLS]', 'b', 'a', '里', '系', '[MASK]', '国', '的', '首', '都', '[SEP]'] is the adversarial text - # of ['[CLS]', '巴', '黎', '是', '[MASK]', '国', '的', '首', '都', '[SEP]'], means - # "Paris is the [MASK] of France" in English - input_ids = mindspore.tensor( - [[101, 144, 143, 7027, 5143, 103, 1744, 4638, 7674, 6963, 102]] - ) - input_shape_ids = mindspore.tensor( - [[2, 20324, 23690, 8740, 706, 1, 10900, 23343, 20205, 5850, 2]] - ) - input_pronunciation_ids = mindspore.tensor( - [[2, 718, 397, 52, 61, 1, 168, 273, 180, 243, 2]] - ) - - output = model(input_ids, input_shape_ids, input_pronunciation_ids) - output_ids = np.argmax(output.logits.asnumpy(), axis=2) - - # convert to tokens is: ['[CLS]', '巴', '*', '黎', '是', '法', '国', '的', '首', '都', '[SEP]'] - expected_output = np.array( - [[101, 2349, 115, 7944, 3221, 3791, 1744, 4638, 7674, 6963, 102]] - ) - - assert np.allclose(output_ids, expected_output) diff --git a/tests/transformers/models/rwkv/__init__.py b/tests/transformers/models/rwkv/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/rwkv/test_modeling_rwkv.py b/tests/transformers/models/rwkv/test_modeling_rwkv.py deleted file mode 100644 index 63b326821..000000000 --- 
a/tests/transformers/models/rwkv/test_modeling_rwkv.py +++ /dev/null @@ -1,485 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest -from unittest.util import safe_repr - -from mindnlp.transformers import AutoTokenizer, RwkvConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - RwkvForCausalLM, - RwkvModel, - ) - - -class RwkvModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - is_training=True, - use_token_type_ids=False, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_token_type_ids = use_token_type_ids - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.use_mc_token_ids = use_mc_token_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.bos_token_id = vocab_size - 1 - self.eos_token_id = vocab_size - 1 - self.pad_token_id = vocab_size - 1 - - def get_large_model_config(self): - return RwkvConfig.from_pretrained("sgugger/rwkv-4-pile-7b") - - def prepare_config_and_inputs( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - mc_token_ids = None - if self.use_mc_token_ids: - mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length) - - 
sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config( - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - return ( - config, - input_ids, - input_mask, - None, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config( - self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False - ): - return RwkvConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - intermediate_size=self.intermediate_size, - activation_function=self.hidden_act, - resid_pdrop=self.hidden_dropout_prob, - attn_pdrop=self.attention_probs_dropout_prob, - n_positions=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - use_cache=True, - bos_token_id=self.bos_token_id, - eos_token_id=self.eos_token_id, - pad_token_id=self.pad_token_id, - gradient_checkpointing=gradient_checkpointing, - scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, - reorder_and_upcast_attn=reorder_and_upcast_attn, - ) - - def get_pipeline_config(self): - config = self.get_config() - config.vocab_size = 300 - return config - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_rwkv_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - config.output_hidden_states = True - model = RwkvModel(config=config) - model.eval() - - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(len(result.hidden_states), config.num_hidden_layers + 1) - - def create_and_check_causl_lm(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = RwkvForCausalLM(config) - model.eval() - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_state_equivalency(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): - model = RwkvModel(config=config) - model.eval() - - outputs = model(input_ids) - output_whole = outputs.last_hidden_state - - outputs = model(input_ids[:, :2]) - output_one = outputs.last_hidden_state - - # Using the state computed on the first inputs, we will get the same output - outputs = model(input_ids[:, 2:], state=outputs.state) - output_two = outputs.last_hidden_state - - self.parent.assertTrue(ops.allclose(ops.cat([output_one, output_two], dim=1), output_whole, 
atol=1e-5)) - - def create_and_check_forward_and_backwards( - self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False - ): - model = RwkvForCausalLM(config) - if gradient_checkpointing: - model.gradient_checkpointing_enable() - - result = model(input_ids, labels=input_ids) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - result.loss.backward() - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - - ( - config, - input_ids, - input_mask, - head_mask, - token_type_ids, - mc_token_ids, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - - inputs_dict = {"input_ids": input_ids} - - return config, inputs_dict - - -@require_mindspore -class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (RwkvModel, RwkvForCausalLM) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": RwkvModel, "text-generation": RwkvForCausalLM} if is_mindspore_available() else {} - ) - all_generative_model_classes = (RwkvForCausalLM,) if is_mindspore_available() else () - fx_compatible = False - test_missing_keys = False - test_model_parallel = False - test_pruning = False - test_head_masking = False # Rwkv does not support head masking - - def setUp(self): - self.model_tester = RwkvModelTester(self) - self.config_tester = ConfigTester( - self, config_class=RwkvConfig, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] - ) - - def assertInterval(self, member, container, msg=None): - r""" - Simple utility function to check if a member is inside an interval. - """ - if isinstance(member, mindspore.Tensor): - max_value, min_value = member.max().item(), member.min().item() - elif isinstance(member, list) or isinstance(member, tuple): - max_value, min_value = max(member), min(member) - - if not isinstance(container, list): - raise TypeError("container should be a list or tuple") - elif len(container) != 2: - raise ValueError("container should have 2 elements") - - expected_min, expected_max = container - - is_inside_interval = (min_value >= expected_min) and (max_value <= expected_max) - - if not is_inside_interval: - standardMsg = "%s not found in %s" % (safe_repr(member), safe_repr(container)) - self.fail(self._formatMessage(msg, standardMsg)) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_rwkv_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_rwkv_model(*config_and_inputs) - - def test_rwkv_lm_head_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_causl_lm(*config_and_inputs) - - def test_state_equivalency(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_state_equivalency(*config_and_inputs) - - def test_initialization(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config=config) - for name, param in model.named_parameters(): - if "time_decay" in name: - if param.requires_grad: - self.assertTrue(param.data.max().item() == 3.0) - self.assertTrue(param.data.min().item() == -5.0) - elif "time_first" in name: - if param.requires_grad: - # check if it's a ones like - 
self.assertTrue(ops.allclose(param.data, ops.ones_like(param.data), atol=1e-5, rtol=1e-5)) - elif any(x in name for x in ["time_mix_key", "time_mix_receptance"]): - if param.requires_grad: - self.assertInterval( - param.data, - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - elif "time_mix_value" in name: - if param.requires_grad: - self.assertInterval( - param.data, - [0.0, 1.3], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_attention_outputs(self): - r""" - Overriding the test_attention_outputs test as the attention outputs of Rwkv are different from other models - it has a shape `batch_size, seq_len, hidden_size`. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - batch_size = inputs["input_ids"].shape[0] - with no_grad(): - outputs = model(**inputs) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - batch_size = inputs["input_ids"].shape[0] - with no_grad(): - outputs = model(**inputs) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [batch_size, seq_len, config.hidden_size], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - batch_size = inputs["input_ids"].shape[0] - with no_grad(): - outputs = model(**inputs) - - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [batch_size, seq_len, config.hidden_size], - ) - - @slow - def test_model_from_pretrained(self): - model_name = "RWKV/rwkv-4-169m-pile" - model = RwkvModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_beam_sample_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_beam_sample_generate_dict_output() - self.has_attentions = old_has_attentions - - def test_beam_search_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_beam_search_generate_dict_output() - self.has_attentions = old_has_attentions - - def test_constrained_beam_search_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those 
checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_constrained_beam_search_generate_dict_output() - self.has_attentions = old_has_attentions - - def test_greedy_generate_dict_outputs(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_greedy_generate_dict_outputs() - self.has_attentions = old_has_attentions - - def test_group_beam_search_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_group_beam_search_generate_dict_output() - self.has_attentions = old_has_attentions - - def test_sample_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_sample_generate_dict_output() - self.has_attentions = old_has_attentions - - @unittest.skip("This model doesn't support padding") - def test_left_padding_compatibility(self): - pass - - -@slow -class RWKVIntegrationTests(unittest.TestCase): - def setUp(self): - self.model_id = "RWKV/rwkv-4-169m-pile" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) - - def test_simple_generate(self): - expected_output = "Hello my name is Jasmine and I am a newbie to the" - model = RwkvForCausalLM.from_pretrained(self.model_id) - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - output = model.generate(input_ids, max_new_tokens=10) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) - - def test_simple_generate_bf16(self): - expected_output = "Hello my name is Jasmine and I am a newbie to the" - - input_ids = self.tokenizer("Hello my name is", return_tensors="ms").input_ids - model = RwkvForCausalLM.from_pretrained(self.model_id, ms_dtype=mindspore.bfloat16) - - output = model.generate(input_ids, max_new_tokens=10) - output_sentence = self.tokenizer.decode(output[0].tolist()) - - self.assertEqual(output_sentence, expected_output) \ No newline at end of file diff --git a/tests/transformers/models/sam/__init__.py b/tests/transformers/models/sam/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/sam/test_modeling_sam.py b/tests/transformers/models/sam/test_modeling_sam.py deleted file mode 100644 index 2760429e6..000000000 --- a/tests/transformers/models/sam/test_modeling_sam.py +++ /dev/null @@ -1,736 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore SAM model.""" - -import gc -import unittest - -import requests - -from mindnlp.transformers import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig, pipeline -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import SamModel, SamProcessor - - -if is_vision_available(): - from PIL import Image - - -class SamPromptEncoderTester: - def __init__( - self, - hidden_size=32, - input_image_size=24, - patch_size=2, - mask_input_channels=4, - num_point_embeddings=4, - hidden_act="gelu", - ): - self.hidden_size = hidden_size - self.input_image_size = input_image_size - self.patch_size = patch_size - self.mask_input_channels = mask_input_channels - self.num_point_embeddings = num_point_embeddings - self.hidden_act = hidden_act - - def get_config(self): - return SamPromptEncoderConfig( - image_size=self.input_image_size, - patch_size=self.patch_size, - mask_input_channels=self.mask_input_channels, - hidden_size=self.hidden_size, - num_point_embeddings=self.num_point_embeddings, - hidden_act=self.hidden_act, - ) - - def prepare_config_and_inputs(self): - dummy_points = floats_tensor([self.batch_size, 3, 2]) - config = self.get_config() - - return config, dummy_points - - -class SamMaskDecoderTester: - def __init__( - self, - hidden_size=32, - hidden_act="relu", - mlp_dim=64, - num_hidden_layers=2, - num_attention_heads=4, - attention_downsample_rate=2, - num_multimask_outputs=3, - iou_head_depth=3, - iou_head_hidden_dim=32, - layer_norm_eps=1e-6, - ): - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.mlp_dim = mlp_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.attention_downsample_rate = attention_downsample_rate - self.num_multimask_outputs = num_multimask_outputs - self.iou_head_depth = iou_head_depth - self.iou_head_hidden_dim = iou_head_hidden_dim - self.layer_norm_eps = layer_norm_eps - - def get_config(self): - return SamMaskDecoderConfig( - hidden_size=self.hidden_size, - hidden_act=self.hidden_act, - mlp_dim=self.mlp_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - attention_downsample_rate=self.attention_downsample_rate, - num_multimask_outputs=self.num_multimask_outputs, - iou_head_depth=self.iou_head_depth, - iou_head_hidden_dim=self.iou_head_hidden_dim, - layer_norm_eps=self.layer_norm_eps, - ) - - def prepare_config_and_inputs(self): - config = self.get_config() - - dummy_inputs = { - "image_embedding": floats_tensor([self.batch_size, self.hidden_size]), - } - - return config, dummy_inputs - - -class SamModelTester: - def __init__( - self, - parent, - hidden_size=36, - intermediate_size=72, - projection_dim=62, - output_channels=32, - num_hidden_layers=2, - num_attention_heads=4, - num_channels=3, - image_size=24, - patch_size=2, - hidden_act="gelu", - layer_norm_eps=1e-06, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - qkv_bias=True, - mlp_ratio=4.0, - use_abs_pos=True, - use_rel_pos=True, - rel_pos_zero_init=False, - window_size=14, - global_attn_indexes=[2, 5, 8, 11], 
- num_pos_feats=16, - mlp_dim=None, - batch_size=2, - ): - self.parent = parent - self.image_size = image_size - self.patch_size = patch_size - self.output_channels = output_channels - self.num_channels = num_channels - self.hidden_size = hidden_size - self.projection_dim = projection_dim - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.hidden_act = hidden_act - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.mlp_ratio = mlp_ratio - self.use_abs_pos = use_abs_pos - self.use_rel_pos = use_rel_pos - self.rel_pos_zero_init = rel_pos_zero_init - self.window_size = window_size - self.global_attn_indexes = global_attn_indexes - self.num_pos_feats = num_pos_feats - self.mlp_dim = mlp_dim - self.batch_size = batch_size - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - self.prompt_encoder_tester = SamPromptEncoderTester() - self.mask_decoder_tester = SamMaskDecoderTester() - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - vision_config = SamVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - projection_dim=self.projection_dim, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - initializer_factor=self.initializer_factor, - output_channels=self.output_channels, - qkv_bias=self.qkv_bias, - mlp_ratio=self.mlp_ratio, - use_abs_pos=self.use_abs_pos, - use_rel_pos=self.use_rel_pos, - rel_pos_zero_init=self.rel_pos_zero_init, - window_size=self.window_size, - global_attn_indexes=self.global_attn_indexes, - num_pos_feats=self.num_pos_feats, - mlp_dim=self.mlp_dim, - ) - - prompt_encoder_config = self.prompt_encoder_tester.get_config() - - mask_decoder_config = self.mask_decoder_tester.get_config() - - return SamConfig( - vision_config=vision_config, - prompt_encoder_config=prompt_encoder_config, - mask_decoder_config=mask_decoder_config, - ) - - def create_and_check_model(self, config, pixel_values): - model = SamModel(config=config) - model.eval() - with no_grad(): - result = model(pixel_values) - self.parent.assertEqual(result.iou_scores.shape, (self.batch_size, 1, 3)) - self.parent.assertEqual(result.pred_masks.shape[:3], (self.batch_size, 1, 3)) - - def create_and_check_get_image_features(self, config, pixel_values): - model = SamModel(config=config) - model.eval() - with no_grad(): - result = model.get_image_embeddings(pixel_values) - self.parent.assertEqual(result[0].shape, (self.output_channels, 12, 12)) - - def create_and_check_get_image_hidden_states(self, config, pixel_values): - model = SamModel(config=config) - model.eval() - with no_grad(): - result = model.vision_encoder( - pixel_values, - output_hidden_states=True, - return_dict=True, - ) - - # after computing the convolutional features - expected_hidden_states_shape = 
(self.batch_size, 12, 12, 36) - self.parent.assertEqual(len(result[1]), self.num_hidden_layers + 1) - self.parent.assertEqual(result[1][0].shape, expected_hidden_states_shape) - - with no_grad(): - result = model.vision_encoder( - pixel_values, - output_hidden_states=True, - return_dict=False, - ) - - # after computing the convolutional features - expected_hidden_states_shape = (self.batch_size, 12, 12, 36) - self.parent.assertEqual(len(result[1]), self.num_hidden_layers + 1) - self.parent.assertEqual(result[1][0].shape, expected_hidden_states_shape) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class SamModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as SAM's vision encoder does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (SamModel,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": SamModel, "mask-generation": SamModel} if is_mindspore_available() else {} - ) - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_torchscript = False - - # TODO: Fix me @Arthur: `run_batch_test` in `tests/test_pipeline_mixin.py` not working - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = SamModelTester(self) - self.vision_config_tester = ConfigTester(self, config_class=SamVisionConfig, has_text_modality=False) - self.prompt_encoder_config_tester = ConfigTester( - self, - config_class=SamPromptEncoderConfig, - has_text_modality=False, - num_attention_heads=12, - num_hidden_layers=2, - ) - self.mask_decoder_config_tester = ConfigTester( - self, config_class=SamMaskDecoderConfig, has_text_modality=False - ) - - def test_config(self): - self.vision_config_tester.run_common_tests() - self.prompt_encoder_config_tester.run_common_tests() - self.mask_decoder_config_tester.run_common_tests() - - @unittest.skip(reason="SAM's vision encoder does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_get_image_features(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_get_image_features(*config_and_inputs) - - def test_image_hidden_states(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_get_image_hidden_states(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - expected_vision_attention_shape = ( - self.model_tester.batch_size * self.model_tester.num_attention_heads, - 196, - 196, - ) - 
expected_mask_decoder_attention_shape = (self.model_tester.batch_size, 1, 144, 32) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - vision_attentions = outputs.vision_attentions - self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers) - - mask_decoder_attentions = outputs.mask_decoder_attentions - self.assertEqual(len(mask_decoder_attentions), self.model_tester.mask_decoder_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - vision_attentions = outputs.vision_attentions - self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers) - - mask_decoder_attentions = outputs.mask_decoder_attentions - self.assertEqual(len(mask_decoder_attentions), self.model_tester.mask_decoder_tester.num_hidden_layers) - - self.assertListEqual( - list(vision_attentions[0].shape[-4:]), - list(expected_vision_attention_shape), - ) - - self.assertListEqual( - list(mask_decoder_attentions[0].shape[-4:]), - list(expected_mask_decoder_attention_shape), - ) - - @unittest.skip(reason="SamModel does not support training") - def test_training(self): - pass - - @unittest.skip(reason="SamModel does not support training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="SamModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="SamModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="SamModel does not support training") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="Hidden_states is tested in create_and_check_model tests") - def test_hidden_states_output(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/sam-vit-huge" - model = SamModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -def prepare_image(): - img_url = "https://hf-mirror.com/ybelkada/segment-anything/resolve/main/assets/car.png" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - return raw_image - - -def prepare_dog_img(): - img_url = "https://hf-mirror.com/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dog-sam.png" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - return raw_image - - -@slow -class SamModelIntegrationTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - 
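    # The integration tests that follow all exercise the same flow: a SamProcessor turns a raw
    # image plus optional point/box prompts into model inputs, the model returns `iou_scores`
    # and `pred_masks`, and the scores are compared against reference values. A minimal,
    # purely illustrative sketch of that flow (assuming the "facebook/sam-vit-base" checkpoint
    # is reachable and reusing `prepare_image`/`no_grad` from this file):
    #
    #     processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
    #     model = SamModel.from_pretrained("facebook/sam-vit-base")
    #     model.eval()
    #     inputs = processor(images=prepare_image(), input_points=[[[400, 650]]], return_tensors="ms")
    #     with no_grad():
    #         outputs = model(**inputs)
    #     scores = outputs.iou_scores.squeeze()   # one IoU estimate per multimask output
    #     masks = outputs.pred_masks              # low-resolution masks, post-processed by the processor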
def test_inference_mask_generation_no_point(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - inputs = processor(images=raw_image, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - masks = outputs.pred_masks[0, 0, 0, 0, :3] - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.4515), atol=2e-4)) - self.assertTrue(ops.allclose(masks, mindspore.tensor([-4.1800, -3.4948, -3.4481]), atol=2e-4)) - - def test_inference_mask_generation_one_point_one_bb(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - input_boxes = [[[650, 900, 1000, 1250]]] - input_points = [[[820, 1080]]] - - inputs = processor( - images=raw_image, input_boxes=input_boxes, input_points=input_points, return_tensors="ms" - ) - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - masks = outputs.pred_masks[0, 0, 0, 0, :3] - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.9566), atol=2e-4)) - self.assertTrue( - ops.allclose(masks, mindspore.tensor([-12.7729, -12.3665, -12.6061]), atol=2e-4) - ) - - def test_inference_mask_generation_batched_points_batched_images(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - input_points = [ - [[[820, 1080]], [[820, 1080]], [[820, 1080]], [[820, 1080]]], - [[[510, 1080]], [[820, 1080]], [[820, 1080]], [[820, 1080]]], - ] - - inputs = processor(images=[raw_image, raw_image], input_points=input_points, return_tensors="ms") - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - masks = outputs.pred_masks[0, 0, 0, 0, :3] - - EXPECTED_SCORES = mindspore.tensor( - [ - [ - [0.6765, 0.9379, 0.8803], - [0.6765, 0.9379, 0.8803], - [0.6765, 0.9379, 0.8803], - [0.6765, 0.9379, 0.8803], - ], - [ - [0.3317, 0.7264, 0.7646], - [0.6765, 0.9379, 0.8803], - [0.6765, 0.9379, 0.8803], - [0.6765, 0.9379, 0.8803], - ], - ] - ) - EXPECTED_MASKS = mindspore.tensor([-2.8550, -2.7988, -2.9625]) - self.assertTrue(ops.allclose(scores, EXPECTED_SCORES, atol=1e-3)) - self.assertTrue(ops.allclose(masks, EXPECTED_MASKS, atol=1e-3)) - - def test_inference_mask_generation_one_point_one_bb_zero(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - input_boxes = [[[620, 900, 1000, 1255]]] - input_points = [[[820, 1080]]] - labels = [[0]] - - inputs = processor( - images=raw_image, - input_boxes=input_boxes, - input_points=input_points, - input_labels=labels, - return_tensors="ms", - ) - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.7894), atol=1e-4)) - - def test_inference_mask_generation_one_point(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - - input_points = [[[400, 650]]] - input_labels = [[1]] - - inputs = processor( - images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="ms" - 
) - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.9675), atol=1e-4)) - - # With no label - input_points = [[[400, 650]]] - - inputs = processor(images=raw_image, input_points=input_points, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.9675), atol=1e-4)) - - def test_inference_mask_generation_two_points(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - - input_points = [[[400, 650], [800, 650]]] - input_labels = [[1, 1]] - - inputs = processor( - images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors="ms" - ) - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.9762), atol=1e-4)) - - # no labels - inputs = processor(images=raw_image, input_points=input_points, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.9762), atol=1e-4)) - - def test_inference_mask_generation_two_points_batched(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - - input_points = [[[400, 650], [800, 650]], [[400, 650]]] - input_labels = [[1, 1], [1]] - - inputs = processor( - images=[raw_image, raw_image], input_points=input_points, input_labels=input_labels, return_tensors="ms" - ) - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - self.assertTrue(ops.allclose(scores[0][-1], mindspore.tensor(0.9762), atol=1e-4)) - self.assertTrue(ops.allclose(scores[1][-1], mindspore.tensor(0.9637), atol=1e-4)) - - def test_inference_mask_generation_one_box(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - - input_boxes = [[[75, 275, 1725, 850]]] - - inputs = processor(images=raw_image, input_boxes=input_boxes, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - scores = outputs.iou_scores.squeeze() - self.assertTrue(ops.allclose(scores[-1], mindspore.tensor(0.7937), atol=1e-4)) - - def test_inference_mask_generation_batched_image_one_point(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - raw_dog_image = prepare_dog_img() - - input_points = [[[820, 1080]], [[220, 470]]] - - inputs = processor(images=[raw_image, raw_dog_image], input_points=input_points, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - scores_batched = outputs.iou_scores.squeeze() - - input_points = [[[220, 470]]] - - inputs = processor(images=raw_dog_image, input_points=input_points, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - scores_single = outputs.iou_scores.squeeze() - - self.assertTrue(ops.allclose(scores_batched[1, :], scores_single, atol=1e-4)) - - def test_inference_mask_generation_two_points_point_batch(self): - model = 
SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - - input_points = mindspore.Tensor([[[400, 650]], [[220, 470]]]) # fmt: skip - - input_points = input_points.unsqueeze(0) - - inputs = processor(raw_image, input_points=input_points, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - - iou_scores = outputs.iou_scores - self.assertTrue(iou_scores.shape == (1, 2, 3)) - assert ops.allclose( - iou_scores, mindspore.tensor([[[0.9105, 0.9825, 0.9675], [0.7646, 0.7943, 0.7774]]]), atol=1e-4, rtol=1e-4 - ) - - def test_inference_mask_generation_three_boxes_point_batch(self): - model = SamModel.from_pretrained("facebook/sam-vit-base") - processor = SamProcessor.from_pretrained("facebook/sam-vit-base") - - model.eval() - - raw_image = prepare_image() - - # fmt: off - input_boxes = mindspore.Tensor([[[620, 900, 1000, 1255]], [[75, 275, 1725, 850]], [[75, 275, 1725, 850]]]) - EXPECTED_IOU = mindspore.tensor([[[0.9773, 0.9881, 0.9522], - [0.5996, 0.7661, 0.7937], - [0.5996, 0.7661, 0.7937]]]) - # fmt: on - input_boxes = input_boxes.unsqueeze(0) - - inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - - iou_scores = outputs.iou_scores - self.assertTrue(iou_scores.shape == (1, 3, 3)) - assert ops.allclose(iou_scores, EXPECTED_IOU, atol=1e-4, rtol=1e-4) - - def test_dummy_pipeline_generation(self): - generator = pipeline("mask-generation", model="facebook/sam-vit-base") - raw_image = prepare_image() - - _ = generator(raw_image, points_per_batch=64) \ No newline at end of file diff --git a/tests/transformers/models/sam/test_processor_sam.py b/tests/transformers/models/sam/test_processor_sam.py deleted file mode 100644 index c1daf8094..000000000 --- a/tests/transformers/models/sam/test_processor_sam.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
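# The tests in this file exercise SamProcessor largely on its own: images are resized and padded to
# 1024x1024 `pixel_values`, and `post_process_masks` maps low-resolution masks back to the original
# image sizes. A minimal, illustrative sketch of that round trip (names follow the tests below;
# `pil_image` and `low_res_masks` are placeholder inputs, not part of this file):
#
#     processor = SamProcessor(SamImageProcessor())
#     encoded = processor(images=[pil_image], return_tensors="np")   # pixel_values: (3, 1024, 1024)
#     masks = processor.post_process_masks(
#         low_res_masks, encoded.original_sizes, encoded.reshaped_input_sizes
#     )                                                               # masks resized to the original image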
-import shutil -import tempfile -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoProcessor, SamImageProcessor, SamProcessor - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - -@require_vision -class SamProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - image_processor = SamImageProcessor() - processor = SamProcessor(image_processor) - processor.save_pretrained(self.tmpdirname) - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - return image_inputs - - def prepare_mask_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - mask_inputs = [np.random.randint(255, size=(30, 400), dtype=np.uint8)] - mask_inputs = [Image.fromarray(x) for x in mask_inputs] - return mask_inputs - - def test_save_load_pretrained_additional_features(self): - processor = SamProcessor(image_processor=self.get_image_processor()) - processor.save_pretrained(self.tmpdirname) - - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = SamProcessor.from_pretrained(self.tmpdirname, do_normalize=False, padding_value=1.0) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, SamImageProcessor) - - def test_image_processor_no_masks(self): - image_processor = self.get_image_processor() - - processor = SamProcessor(image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - for image in input_feat_extract.pixel_values: - self.assertEqual(image.shape, (3, 1024, 1024)) - - for original_size in input_feat_extract.original_sizes: - np.testing.assert_array_equal(original_size, np.array([30, 400])) - - for reshaped_input_size in input_feat_extract.reshaped_input_sizes: - np.testing.assert_array_equal( - reshaped_input_size, np.array([77, 1024]) - ) # reshaped_input_size value is before padding - - def test_image_processor_with_masks(self): - image_processor = self.get_image_processor() - - processor = SamProcessor(image_processor=image_processor) - - image_input = self.prepare_image_inputs() - mask_input = self.prepare_mask_inputs() - - input_feat_extract = image_processor(images=image_input, segmentation_maps=mask_input, return_tensors="np") - input_processor = processor(images=image_input, segmentation_maps=mask_input, 
return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - for label in input_feat_extract.labels: - self.assertEqual(label.shape, (256, 256)) - - @require_mindspore - def test_post_process_masks(self): - image_processor = self.get_image_processor() - - processor = SamProcessor(image_processor=image_processor) - dummy_masks = [ops.ones((1, 3, 5, 5))] - - original_sizes = [[1764, 2646]] - - reshaped_input_size = [[683, 1024]] - masks = processor.post_process_masks(dummy_masks, original_sizes, reshaped_input_size) - self.assertEqual(masks[0].shape, (1, 3, 1764, 2646)) - - masks = processor.post_process_masks( - dummy_masks, mindspore.tensor(original_sizes), mindspore.tensor(reshaped_input_size) - ) - self.assertEqual(masks[0].shape, (1, 3, 1764, 2646)) - - # should also work with np - dummy_masks = [np.ones((1, 3, 5, 5))] - masks = processor.post_process_masks(dummy_masks, np.array(original_sizes), np.array(reshaped_input_size)) - - self.assertEqual(masks[0].shape, (1, 3, 1764, 2646)) - - dummy_masks = [[1, 0], [0, 1]] - with self.assertRaises(ValueError): - masks = processor.post_process_masks(dummy_masks, np.array(original_sizes), np.array(reshaped_input_size)) - - -@require_vision -class SamProcessorEquivalenceTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - image_processor = SamImageProcessor() - processor = SamProcessor(image_processor) - processor.save_pretrained(self.tmpdirname) - - def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs diff --git a/tests/transformers/models/seamless_m4t/__init__.py b/tests/transformers/models/seamless_m4t/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/transformers/models/seamless_m4t/test_modeling_seamless_m4t.py deleted file mode 100644 index 1cd6553aa..000000000 --- a/tests/transformers/models/seamless_m4t/test_modeling_seamless_m4t.py +++ /dev/null @@ -1,1132 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore SeamlessM4T model.""" - -import copy -import tempfile -import unittest - -from mindnlp.transformers import SeamlessM4TConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.engine import set_seed -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - SeamlessM4TForSpeechToSpeech, - SeamlessM4TForSpeechToText, - SeamlessM4TForTextToSpeech, - SeamlessM4TForTextToText, - SeamlessM4TModel, - ) - from mindnlp.transformers import SeamlessM4TProcessor - - -class SeamlessM4TModelTester: - def __init__( - self, - parent, - input_modality="speech", - batch_size=2, - seq_length=4, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - max_new_tokens=None, - num_labels=3, - num_choices=4, - scope=None, - vocab_size=20, - t2u_vocab_size=20, - hidden_size=6, - num_hidden_layers=2, - intermediate_size=6, - max_position_embeddings=256, - encoder_layers=2, - decoder_layers=2, - encoder_ffn_dim=6, - decoder_ffn_dim=6, - t2u_encoder_layers=2, - t2u_decoder_layers=2, - t2u_encoder_ffn_dim=6, - t2u_decoder_ffn_dim=6, - num_heads=2, - vocoder_num_spkrs=5, - vocoder_num_langs=5, - upsample_initial_channel=32, - unit_embed_dim=25, - spkr_embed_dim=6, - lang_embed_dim=6, - num_conv_pos_embeddings=8, - unit_hifi_gan_vocab_size=20, - t2u_num_langs=0, - t2u_max_new_tokens=25, - t2u_offset_tgt_lang=0, - vocoder_offset=0, - ): - self.parent = parent - self.input_modality = input_modality - - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - self.vocab_size = vocab_size - self.t2u_vocab_size = t2u_vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.max_position_embeddings = max_position_embeddings - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.decoder_ffn_dim = decoder_ffn_dim - self.t2u_encoder_layers = t2u_encoder_layers - self.t2u_decoder_layers = t2u_decoder_layers - self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim - self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim - self.num_heads = num_heads - self.num_attention_heads = num_heads - - self.vocoder_num_spkrs = vocoder_num_spkrs - self.vocoder_num_langs = vocoder_num_langs - self.upsample_initial_channel = upsample_initial_channel - self.unit_embed_dim = unit_embed_dim - self.spkr_embed_dim = spkr_embed_dim - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.lang_embed_dim = lang_embed_dim - - 
self.max_new_tokens = max_new_tokens - - self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size - self.t2u_num_langs = t2u_num_langs - self.t2u_max_new_tokens = t2u_max_new_tokens - self.t2u_offset_tgt_lang = t2u_offset_tgt_lang - self.vocoder_offset = vocoder_offset - - def prepare_config_and_inputs(self): - if self.input_modality == "text": - inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) - else: - inputs = ids_tensor([self.batch_size, self.seq_length, 160], self.vocab_size - 1).float() - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) - - lm_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return config, inputs, decoder_input_ids, input_mask, lm_labels - - def get_config(self): - return SeamlessM4TConfig( - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - t2u_vocab_size=self.t2u_vocab_size, - hidden_size=self.hidden_size, - speech_encoder_layers=self.num_heads, - speech_encoder_intermediate_size=self.intermediate_size, - max_position_embeddings=self.max_position_embeddings, - encoder_layers=self.encoder_layers, - decoder_layers=self.decoder_layers, - encoder_ffn_dim=self.encoder_ffn_dim, - decoder_ffn_dim=self.decoder_ffn_dim, - t2u_encoder_layers=self.t2u_encoder_layers, - t2u_decoder_layers=self.t2u_decoder_layers, - t2u_encoder_ffn_dim=self.t2u_encoder_ffn_dim, - t2u_decoder_ffn_dim=self.t2u_decoder_ffn_dim, - num_attention_heads=self.num_heads, - encoder_attention_heads=self.num_heads, - decoder_attention_heads=self.num_heads, - t2u_encoder_attention_heads=self.num_heads, - t2u_decoder_attention_heads=self.num_heads, - speech_encoder_attention_heads=self.num_heads, - unit_hifigan_vocab_vise=self.t2u_vocab_size, - vocoder_num_spkrs=self.vocoder_num_spkrs, - vocoder_num_langs=self.vocoder_num_langs, - upsample_initial_channel=self.upsample_initial_channel, - unit_embed_dim=self.unit_embed_dim, - spkr_embed_dim=self.spkr_embed_dim, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - lang_embed_dim=self.lang_embed_dim, - max_new_tokens=self.max_new_tokens, - unit_hifi_gan_vocab_size=self.unit_hifi_gan_vocab_size, - t2u_num_langs=self.t2u_num_langs, - t2u_max_new_tokens=self.t2u_max_new_tokens, - t2u_offset_tgt_lang=self.t2u_offset_tgt_lang, - vocoder_offset=self.vocoder_offset, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model(self, config, input_ids, decoder_input_ids, input_mask, labels): - model = SeamlessM4TModel(config=config) - model.eval() - if self.input_modality == "text": - result = model(input_ids=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - else: - result = model(input_features=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids) - result = model(input_features=input_ids, decoder_input_ids=decoder_input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - decoder_output = result.logits - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - if self.input_modality == "text": - seq_length = self.seq_length - else: - # if speech, expected length has been subsampled. - seq_length = model._compute_sub_sample_lengths_from_attention_mask(input_mask).max().item() - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, decoder_input_ids.shape[1], self.vocab_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.decoder_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - model = SeamlessM4TModel(config=config) - model.eval() - - # make sure no pad token in decoder_input_ids - decoder_input_ids = ops.clamp(decoder_input_ids, config.pad_token_id + 1) - - # first forward pass - outputs = model( - input_ids, decoder_input_ids=decoder_input_ids, decoder_attention_mask=input_mask, use_cache=True - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([decoder_input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - input_ids, - decoder_input_ids=next_input_ids, - decoder_attention_mask=next_attention_mask, - output_hidden_states=True, - ) - output_from_no_past = output_from_no_past["decoder_hidden_states"][0] - output_from_past = model( - input_ids, - decoder_input_ids=next_tokens, - decoder_attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["decoder_hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - ) = config_and_inputs - - input_name = "input_ids" if self.input_modality == "text" else "input_features" - - inputs_dict = { - input_name: input_ids, - "attention_mask": input_mask, - 
"decoder_input_ids": decoder_input_ids, - "labels": lm_labels, - } - return config, inputs_dict - - -@require_mindspore -class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase): - is_encoder_decoder = True - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_model_parallel = False - test_resize_embeddings = False - test_headmasking = False - test_torchscript = False - - all_model_classes = ( - ( - SeamlessM4TModel, - SeamlessM4TForSpeechToSpeech, - SeamlessM4TForSpeechToText, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (SeamlessM4TForSpeechToText,) if is_mindspore_available() else () - - input_name = "input_features" - - def setUp(self): - self.model_tester = SeamlessM4TModelTester(self, input_modality="speech") - self.config_tester = ConfigTester(self, config_class=SeamlessM4TConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/hf-seamless-m4t-medium" - model = SeamlessM4TModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict[self.input_name] - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - - # generate max 3 tokens - max_length = input_ids.shape[-1] + 3 - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - - attention_mask = ops.ones(input_ids.shape[:2], dtype=mindspore.int64)[:batch_size, :sequence_length] - - return config, input_ids.float(), attention_mask, max_length - - @staticmethod - def _get_encoder_outputs( - model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 - ): - encoder = model.get_encoder() - encoder_outputs = encoder( - input_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - encoder_outputs["last_hidden_state"] = ops.repeat_interleave(encoder_outputs.last_hidden_state, - num_interleave, dim=0 - ) - generation_config = copy.deepcopy(model.generation_config) - model._prepare_special_tokens(generation_config) - input_ids = ( - ops.zeros(input_ids.shape[:2], dtype=mindspore.int64) - + generation_config.decoder_start_token_id - ) - attention_mask = None - return encoder_outputs, input_ids, attention_mask - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - 
"feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="SeamlessM4TSpeechEncoder doesn't have an embedding layer") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="SeamlessM4TSpeechEncoder doesn't have an embedding layer") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip( - reason="Expected missing keys serve when using SeamlessM4TForXXX.from_pretrained from a checkpoint saved by SeamlessM4TModel.save_pretrained." - ) - def test_model_weights_reload_no_missing_tied_weights(self): - pass - - @unittest.skip( - reason="SeamlessM4TModel is base class but has actually a bigger architecture than seamlessM4T task-specific models." - ) - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="SeamlessM4TModel can takes input_ids or input_features") - def test_forward_signature(self): - pass - - @unittest.skip(reason="SeamlessM4T has no base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - def test_attention_outputs(self): - # expected length is subsampled so need to change a bit this test - if not self.has_attentions: - self.skipTest(reason="Model does not output attentions") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - # no more chunk_length test - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - 
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - - sub_sampled_length = ( - model._compute_sub_sample_lengths_from_attention_mask(inputs_dict["attention_mask"]).max().item() - ) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - sub_sampled_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - -@require_mindspore -class SeamlessM4TModelWithTextInputTest( - ModelTesterMixin, GenerationTesterMixin, unittest.TestCase -): - is_encoder_decoder = True - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_model_parallel = False - test_resize_embeddings = True - test_headmasking = False - test_torchscript = False - - all_model_classes = ( - ( - SeamlessM4TModel, - SeamlessM4TForTextToSpeech, - SeamlessM4TForTextToText, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (SeamlessM4TForTextToText,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "automatic-speech-recognition": SeamlessM4TForSpeechToText, - "feature-extraction": SeamlessM4TModel, - "summarization": SeamlessM4TForTextToText, - "text-to-audio": 
SeamlessM4TForTextToSpeech, - "text2text-generation": SeamlessM4TForTextToText, - "translation": SeamlessM4TForTextToText, - } - if is_mindspore_available() - else {} - ) - - def setUp(self): - self.model_tester = SeamlessM4TModelTester(self, input_modality="text") - self.config_tester = ConfigTester(self, config_class=SeamlessM4TConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/hf-seamless-m4t-medium" - model = SeamlessM4TModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip( - reason="Expected missing keys serve when using SeamlessM4TForXXX.from_pretrained from a checkpoint saved by SeamlessM4TModel.save_pretrained." - ) - def test_model_weights_reload_no_missing_tied_weights(self): - pass - - @unittest.skip(reason="SeamlessM4TModel can take input_ids or input_features") - def test_forward_signature(self): - pass - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - @unittest.skip( - reason="SeamlessM4TModel is base class but has actually a bigger architecture than seamlessM4T task-specific models." 
- ) - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="SeamlessM4T has no base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip( - reason="In training model, the first encoder layer is sometimes skipped. Training is not supported yet, so the test is ignored." - ) - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - -@require_mindspore -class SeamlessM4TGenerationTest(unittest.TestCase): - # test that non-standard generation works - # test generation of: SeamlessM4TModel, SeamlessM4TForSpeechToSpeech, SeamlessM4TForSpeechToText, SeamlessM4TForTextToSpeech - - def setUp(self): - self.speech_model_tester = SeamlessM4TModelTester(self, input_modality="speech") - self.text_model_tester = SeamlessM4TModelTester(self, input_modality="text") - self.tmpdirname = tempfile.mkdtemp() - - def update_generation(self, model): - lang_code_to_id = { - "fra": 4, - "eng": 4, - } - - generation_config = copy.deepcopy(model.generation_config) - - generation_config.__setattr__("text_decoder_lang_to_code_id", lang_code_to_id) - generation_config.__setattr__("t2u_lang_code_to_id", lang_code_to_id) - generation_config.__setattr__("vocoder_lang_code_to_id", lang_code_to_id) - - generation_config._from_model_config = False - - model.generation_config = generation_config - - def prepare_text_input(self): - config, inputs, decoder_input_ids, input_mask, lm_labels = self.text_model_tester.prepare_config_and_inputs() - - input_dict = { - "input_ids": inputs, - "attention_mask": input_mask, - "tgt_lang": "eng", - "num_beams": 2, - "do_sample": True, - } - - return config, input_dict - - def prepare_speech_input(self): - config, inputs, decoder_input_ids, input_mask, lm_labels = self.speech_model_tester.prepare_config_and_inputs() - - input_dict = { - "input_features": inputs, - "attention_mask": input_mask, - "tgt_lang": "fra", - "num_beams": 2, - "do_sample": True, - } - - return config, input_dict - - def prepare_speech_and_text_input(self): - config, inputs, decoder_input_ids, input_mask, lm_labels = self.speech_model_tester.prepare_config_and_inputs() - - input_speech = { - "input_features": inputs, - "attention_mask": input_mask, - "tgt_lang": "fra", - "num_beams": 2, - "do_sample": True, - } - - config, inputs, decoder_input_ids, input_mask, lm_labels = self.text_model_tester.prepare_config_and_inputs() - - input_text = { - "input_ids": inputs, - "attention_mask": input_mask, - "tgt_lang": "eng", - "num_beams": 2, - "do_sample": True, - } - return config, input_speech, input_text - - def 
factory_generation_speech_test(self, model, inputs): - set_seed(0) - output = model.generate(**inputs) - return output - - def test_speech_generation(self): - config, input_speech, input_text = self.prepare_speech_and_text_input() - - model = SeamlessM4TModel(config=config) - self.update_generation(model) - model.save_pretrained(self.tmpdirname) - model.eval() - - output_original_text = self.factory_generation_speech_test(model, input_text) - output_original_speech = self.factory_generation_speech_test(model, input_speech) - - state_dict = model.state_dict() - - text_model = SeamlessM4TForTextToSpeech.from_pretrained(self.tmpdirname) - self.update_generation(text_model) - text_model.eval() - - output_text = self.factory_generation_speech_test(model, input_text) - - speech_model = SeamlessM4TForSpeechToSpeech.from_pretrained(self.tmpdirname) - self.update_generation(speech_model) - speech_model.eval() - - for name, tensor in speech_model.state_dict().items(): - right_tensor = state_dict.get(name) - self.assertEqual(tensor.tolist(), right_tensor.tolist(), f"Tensor {name}") - - output_speech = self.factory_generation_speech_test(model, input_speech) - - # test same text output from input text - self.assertListEqual(output_original_text[0].ravel().tolist(), output_text[0].ravel().tolist()) - self.assertListEqual(output_original_text[1].ravel().tolist(), output_text[1].ravel().tolist()) - - # test same speech output from input text - # assertTrue because super long list makes this hang in case of failure - self.assertTrue( - output_original_speech[0].ravel().tolist() == output_speech[0].ravel().tolist(), - "Speech generated was different", - ) - self.assertTrue( - output_original_speech[1].ravel().tolist() == output_speech[1].ravel().tolist(), - "Speech generated was different", - ) - - def test_text_generation(self): - config, input_speech, input_text = self.prepare_speech_and_text_input() - - # to return speech - input_speech["generate_speech"] = False - input_text["generate_speech"] = False - - model = SeamlessM4TModel(config=config) - self.update_generation(model) - model.save_pretrained(self.tmpdirname) - model.eval() - - output_original_text = self.factory_generation_speech_test(model, input_text) - output_original_speech = self.factory_generation_speech_test(model, input_speech) - - # other models don't need it - input_speech.pop("generate_speech") - input_text.pop("generate_speech") - - state_dict = model.state_dict() - - text_model = SeamlessM4TForTextToText.from_pretrained(self.tmpdirname) - self.update_generation(text_model) - text_model.eval() - - for name, tensor in text_model.state_dict().items(): - right_tensor = state_dict.get(name) - self.assertEqual(tensor.tolist(), right_tensor.tolist()) - - output_text = self.factory_generation_speech_test(text_model, input_text) - - speech_model = SeamlessM4TForSpeechToText.from_pretrained(self.tmpdirname) - - for name, tensor in speech_model.state_dict().items(): - right_tensor = state_dict.get(name) - self.assertEqual(tensor.tolist(), right_tensor.tolist(), f"Tensor {name}") - - self.update_generation(speech_model) - speech_model.eval() - - output_speech = self.factory_generation_speech_test(speech_model, input_speech) - - # test same text output from input text - self.assertListEqual(output_original_text[0].ravel().tolist(), output_text.ravel().tolist()) - - # test same speech output from input text - self.assertListEqual(output_original_speech[0].ravel().tolist(), output_speech.ravel().tolist()) - - def test_generation(self): - config, 
input_speech, input_text = self.prepare_speech_and_text_input() - - input_speech["num_beams"] = 3 - input_speech["do_sample"] = True - input_speech["num_return_sequences"] = 3 - - input_text["num_beams"] = 3 - input_text["do_sample"] = True - input_text["num_return_sequences"] = 3 - - for model_class in [SeamlessM4TForSpeechToSpeech, SeamlessM4TForSpeechToText, SeamlessM4TModel]: - model = model_class(config=config) - self.update_generation(model) - model.eval() - - output = model.generate(**input_speech) - output = output[0] if isinstance(output, tuple) else output - - self.assertEqual(output.shape[0], 3 * input_speech["input_features"].shape[0]) - - for model_class in [SeamlessM4TForTextToSpeech, SeamlessM4TForTextToText, SeamlessM4TModel]: - model = model_class(config=config) - self.update_generation(model) - model.eval() - - output = model.generate(**input_text) - - output = output[0] if isinstance(output, tuple) else output - - self.assertEqual(output.shape[0], 3 * input_text["input_ids"].shape[0]) - - -@require_mindspore -class SeamlessM4TModelIntegrationTest(unittest.TestCase): - repo_id = "facebook/hf-seamless-m4t-medium" - - def assertListAlmostEqual(self, list1, list2, tol=1e-3): - self.assertEqual(len(list1), len(list2)) - for a, b in zip(list1, list2): - self.assertAlmostEqual(a, b, delta=tol) - - @cached_property - def processor(self): - return SeamlessM4TProcessor.from_pretrained(self.repo_id) - - @cached_property - def input_text(self): - # corresponds to "C'est un test." with seamlessM4T_medium checkpoint - - input_ids = mindspore.tensor([[256057, 152, 248116, 354, 159, 7356, 248075, 3]]) # fmt: skip - - attention_mask = ops.ones_like(input_ids) - - inputs = { - "attention_mask": attention_mask, - "input_ids": input_ids, - } - - return inputs - - @cached_property - def input_audio(self): - set_seed(0) - seq_len = 20000 - sampling_rate = 16000 - input_features = ops.rand((2, seq_len)) - - return self.processor(audios=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="ms") - - def factory_test_task(self, class1, class2, inputs, class1_kwargs, class2_kwargs): - model1 = class1.from_pretrained(self.repo_id) - model2 = class2.from_pretrained(self.repo_id) - - set_seed(0) - output_1 = model1.generate(**inputs, **class1_kwargs) - set_seed(0) - output_2 = model2.generate(**inputs, **class2_kwargs) - - for key in output_1: - if isinstance(output_1[key], mindspore.Tensor): - if len(output_1[key].shape) == 0: - self.assertEqual(output_1[key].item(), output_2[key].item()) - else: - self.assertListAlmostEqual(output_1[key].squeeze().tolist(), output_2[key].squeeze().tolist()) - - @slow - def test_to_eng_text(self): - model = SeamlessM4TModel.from_pretrained(self.repo_id) - - # test text - tgt lang: eng - - expected_text_tokens = [3, 256047, 3291, 248116, 248066, 9, 7356, 248075, 3] # fmt: skip - - # fmt: off - expected_unit_tokens = [ - 2,10051,8980,8212,949,1270,4311,1123,5918,2333,5311,3882,2415,5284,1123,612,8816,6370,5386,7334,4345,5645, - 9437,5748,1378,9818,4319,7968,7375,2909,9119,5151,8728,5335,3896,4013,8939,8885,6048,9530,3167,5833,1072,693, - 431,9867,364,7909,4608,5938,1889,9984,7947,4944,6171,3767,9861,9169,1187,8365,4571,7635,7784,7635,800,2393, - 32,5380,5852,8289,2530,2762,1833,2056,3553,4641,3553,5683,370,2288,1344,1518,7534,703,8359,7699,2 - ] - # fmt: on - - expected_wav_slice = [-3e-05, -0.0004, -0.00037, -0.00013, -6e-05, 0.00012, -0.00016, 0.00025, 7e-05, -3e-05] # fmt: skip - - set_seed(0) - output = model.generate(**self.input_text, 
num_beams=1, tgt_lang="eng", return_intermediate_token_ids=True) - - self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - # FOR NOW, only first units correspondance - self.assertListEqual(expected_unit_tokens[:10], output.unit_sequences.squeeze().tolist()[:10]) - - self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) - - @slow - def test_to_swh_text(self): - model = SeamlessM4TModel.from_pretrained(self.repo_id) - - # test text - tgt lang: swh - - expected_text_tokens = [3, 256168, 1665, 188589, 7040, 248075, 3] # fmt: skip - - # fmt: off - expected_unit_tokens = [ - 2,10071,5729,9995,3089,7546,1204,1721,2532,4340,5623,3496,432,7730,9096,7677,3143,8211,6447,8399,4248,3565, - 4529,7700,9308,217,6476,3485,9667,3194,8476,4923,5593,1148,4466,7416,4872,463,4872,253,2348,4640,3450,2133, - 6318,2806,817,7613,2698,6563,8712,8344,9286,6878,6387,4281,6387,640,6387,3200,640,8355,640,6708,979,1738,2 - ] - # fmt: on - - expected_wav_slice = [1e-05, -7e-05, -4e-05, -4e-05, -6e-05, -9e-05, -0.0001, -2e-05, -7e-05, -2e-05] # fmt: skip - - set_seed(0) - output = model.generate(**self.input_text, num_beams=1, tgt_lang="swh", return_intermediate_token_ids=True) - - self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - self.assertListEqual(expected_unit_tokens[:10], output.unit_sequences.squeeze().tolist()[:10]) - - self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) - - @slow - def test_to_rus_speech(self): - model = SeamlessM4TModel.from_pretrained(self.repo_id) - - # test audio - tgt lang: rus - - expected_text_tokens = [3, 256147, 1197, 73565, 3413, 537, 233331, 248075, 3] # fmt: skip - - # fmt: off - expected_unit_tokens = [ - 2, 10067, 5729, 4798, 9631, 8378, 4446, 2393, 6901, 5983, 2817, 4629, 8532, 1991, 2931, 8576, 8857, 5936, 4317, - 9000, 7740, 7995, 1225, 5980, 6094, 1420, 5373, 8771, 6600, 4487, 7029, 3630, 6740, 4870, 1483, 3003, 5585, 5511, - 7465, 3222, 32, 6272, 1950, 3120, 5368, 639, 3713, 5935, 7943, 567, 6129, 6822, 1226, 5063, 9878, 7756, 8825, 1078, 5943, - 457, 9282, 9668, 817, 7613, 2698, 6563, 8712, 8704, 9286, 8704, 6387, 4281, 6387, 640, 3200, 6387, 640, 8355, 6708, 979, 1738, 2 - ] - # fmt: on - - expected_wav_slice = [0.00013, 0.00012, 0.00014, 3e-05, 0.0, -6e-05, -0.00018, -0.00016, -0.00021, -0.00018] # fmt: skip - - set_seed(0) - output = model.generate(**self.input_audio, num_beams=1, tgt_lang="rus", return_intermediate_token_ids=True) - - self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - self.assertListEqual(expected_unit_tokens[:10], output.unit_sequences.squeeze().tolist()[:10]) - - self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) - - @slow - def test_text_to_text_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False} - kwargs2 = { - "tgt_lang": "eng", - "output_hidden_states": True, - "return_dict_in_generate": True, - "output_scores": True, - } - self.factory_test_task(SeamlessM4TModel, SeamlessM4TForTextToText, self.input_text, kwargs1, kwargs2) - - @slow - def test_speech_to_text_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False} - kwargs2 = { - "tgt_lang": "eng", - "output_hidden_states": True, - "return_dict_in_generate": True, - "output_scores": True, - } - self.factory_test_task(SeamlessM4TModel, SeamlessM4TForSpeechToText, self.input_audio, kwargs1, 
kwargs2) - - @slow - def test_speech_to_speech_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True} - self.factory_test_task(SeamlessM4TModel, SeamlessM4TForSpeechToSpeech, self.input_audio, kwargs1, kwargs1) - - @slow - def test_text_to_speech_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True} - - self.factory_test_task(SeamlessM4TModel, SeamlessM4TForTextToSpeech, self.input_text, kwargs1, kwargs1) \ No newline at end of file diff --git a/tests/transformers/models/seamless_m4t_v2/__init__.py b/tests/transformers/models/seamless_m4t_v2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/transformers/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py deleted file mode 100644 index 99bdcd540..000000000 --- a/tests/transformers/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ /dev/null @@ -1,1187 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch SeamlessM4Tv2 model.""" - -import copy -import tempfile -import unittest - -from mindnlp.transformers import SeamlessM4Tv2Config, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.engine import set_seed -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - SeamlessM4Tv2ForSpeechToSpeech, - SeamlessM4Tv2ForSpeechToText, - SeamlessM4Tv2ForTextToSpeech, - SeamlessM4Tv2ForTextToText, - SeamlessM4Tv2Model, - ) - - from mindnlp.transformers import SeamlessM4TProcessor - - -class SeamlessM4Tv2ModelTester: - def __init__( - self, - parent, - input_modality="speech", - batch_size=2, - seq_length=4, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - max_new_tokens=None, - num_labels=3, - num_choices=4, - scope=None, - vocab_size=20, - t2u_vocab_size=20, - hidden_size=6, - num_hidden_layers=2, - intermediate_size=6, - max_position_embeddings=256, - encoder_layers=2, - decoder_layers=2, - encoder_ffn_dim=6, - decoder_ffn_dim=6, - t2u_encoder_layers=2, - t2u_decoder_layers=2, - t2u_encoder_ffn_dim=6, - t2u_decoder_ffn_dim=6, - num_heads=2, - vocoder_num_spkrs=5, - vocoder_num_langs=5, - upsample_initial_channel=32, - unit_embed_dim=25, - spkr_embed_dim=6, - lang_embed_dim=6, - num_conv_pos_embeddings=8, - unit_hifi_gan_vocab_size=20, - t2u_num_langs=0, - t2u_offset_tgt_lang=0, - 
vocoder_offset=0, - t2u_variance_predictor_hidden_dim=4, - char_vocab_size=4, - left_max_position_embeddings=2, - right_max_position_embeddings=1, - speech_encoder_chunk_size=2, - speech_encoder_left_chunk_num=1, - ): - self.parent = parent - self.input_modality = input_modality - - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - self.vocab_size = vocab_size - self.t2u_vocab_size = t2u_vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.intermediate_size = intermediate_size - self.max_position_embeddings = max_position_embeddings - self.encoder_layers = encoder_layers - self.decoder_layers = decoder_layers - self.encoder_ffn_dim = encoder_ffn_dim - self.decoder_ffn_dim = decoder_ffn_dim - self.t2u_encoder_layers = t2u_encoder_layers - self.t2u_decoder_layers = t2u_decoder_layers - self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim - self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim - self.num_heads = num_heads - self.num_attention_heads = num_heads - - self.vocoder_num_spkrs = vocoder_num_spkrs - self.vocoder_num_langs = vocoder_num_langs - self.upsample_initial_channel = upsample_initial_channel - self.unit_embed_dim = unit_embed_dim - self.spkr_embed_dim = spkr_embed_dim - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.lang_embed_dim = lang_embed_dim - - self.max_new_tokens = max_new_tokens - - self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size - self.t2u_num_langs = t2u_num_langs - self.t2u_offset_tgt_lang = t2u_offset_tgt_lang - self.vocoder_offset = vocoder_offset - - self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim - self.char_vocab_size = char_vocab_size - self.left_max_position_embeddings = left_max_position_embeddings - self.right_max_position_embeddings = right_max_position_embeddings - self.speech_encoder_chunk_size = speech_encoder_chunk_size - self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num - - def prepare_config_and_inputs(self): - if self.input_modality == "text": - inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) - else: - inputs = ids_tensor([self.batch_size, self.seq_length, 160], self.vocab_size - 1).float() - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1) - - lm_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return config, inputs, decoder_input_ids, input_mask, lm_labels - - def get_config(self): - return SeamlessM4Tv2Config( - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - t2u_vocab_size=self.t2u_vocab_size, - hidden_size=self.hidden_size, - speech_encoder_layers=self.num_heads, - speech_encoder_intermediate_size=self.intermediate_size, - max_position_embeddings=self.max_position_embeddings, - encoder_layers=self.encoder_layers, - 
decoder_layers=self.decoder_layers, - encoder_ffn_dim=self.encoder_ffn_dim, - decoder_ffn_dim=self.decoder_ffn_dim, - t2u_encoder_layers=self.t2u_encoder_layers, - t2u_decoder_layers=self.t2u_decoder_layers, - t2u_encoder_ffn_dim=self.t2u_encoder_ffn_dim, - t2u_decoder_ffn_dim=self.t2u_decoder_ffn_dim, - num_attention_heads=self.num_heads, - encoder_attention_heads=self.num_heads, - decoder_attention_heads=self.num_heads, - t2u_encoder_attention_heads=self.num_heads, - t2u_decoder_attention_heads=self.num_heads, - speech_encoder_attention_heads=self.num_heads, - unit_hifigan_vocab_vise=self.t2u_vocab_size, - vocoder_num_spkrs=self.vocoder_num_spkrs, - vocoder_num_langs=self.vocoder_num_langs, - upsample_initial_channel=self.upsample_initial_channel, - unit_embed_dim=self.unit_embed_dim, - spkr_embed_dim=self.spkr_embed_dim, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - lang_embed_dim=self.lang_embed_dim, - max_new_tokens=self.max_new_tokens, - unit_hifi_gan_vocab_size=self.unit_hifi_gan_vocab_size, - t2u_num_langs=self.t2u_num_langs, - t2u_offset_tgt_lang=self.t2u_offset_tgt_lang, - vocoder_offset=self.vocoder_offset, - t2u_variance_predictor_embed_dim=self.hidden_size, - t2u_variance_predictor_hidden_dim=self.t2u_variance_predictor_hidden_dim, - char_vocab_size=self.char_vocab_size, - left_max_position_embeddings=self.left_max_position_embeddings, - right_max_position_embeddings=self.right_max_position_embeddings, - speech_encoder_chunk_size=self.speech_encoder_chunk_size, - speech_encoder_left_chunk_num=self.speech_encoder_left_chunk_num, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model(self, config, input_ids, decoder_input_ids, input_mask, labels): - model = SeamlessM4Tv2Model(config=config) - model.eval() - if self.input_modality == "text": - result = model(input_ids=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - else: - result = model(input_features=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids) - result = model(input_features=input_ids, decoder_input_ids=decoder_input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - decoder_output = result.logits - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - if self.input_modality == "text": - seq_length = self.seq_length - else: - # if speech, expected length has been subsampled. 
- seq_length = model._compute_sub_sample_lengths_from_attention_mask(input_mask).max().item() - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, decoder_input_ids.shape[1], self.vocab_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.decoder_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - model = SeamlessM4Tv2Model(config=config) - model.eval() - - # make sure no pad token in decoder_input_ids - decoder_input_ids = ops.clamp(decoder_input_ids, config.pad_token_id + 1) - - # first forward pass - outputs = model( - input_ids, decoder_input_ids=decoder_input_ids, decoder_attention_mask=input_mask, use_cache=True - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([decoder_input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - input_ids, - decoder_input_ids=next_input_ids, - decoder_attention_mask=next_attention_mask, - output_hidden_states=True, - ) - output_from_no_past = output_from_no_past["decoder_hidden_states"][0] - output_from_past = model( - input_ids, - decoder_input_ids=next_tokens, - decoder_attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["decoder_hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - input_mask, - lm_labels, - ) = config_and_inputs - - input_name = "input_ids" if self.input_modality == "text" else "input_features" - - inputs_dict = { - input_name: input_ids, - "attention_mask": input_mask, - "decoder_input_ids": decoder_input_ids, - "labels": lm_labels, - } - return config, inputs_dict - - -@require_mindspore -class SeamlessM4Tv2ModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase): - is_encoder_decoder = True - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_model_parallel = False - test_resize_embeddings = False - test_headmasking = False - test_torchscript = False - - all_model_classes = ( - ( - SeamlessM4Tv2Model, - SeamlessM4Tv2ForSpeechToSpeech, - SeamlessM4Tv2ForSpeechToText, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (SeamlessM4Tv2ForSpeechToText,) if 
is_mindspore_available() else () - - input_name = "input_features" - - def setUp(self): - self.model_tester = SeamlessM4Tv2ModelTester(self, input_modality="speech") - self.config_tester = ConfigTester(self, config_class=SeamlessM4Tv2Config) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/seamless-m4t-v2-large" - model = SeamlessM4Tv2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - def _get_input_ids_and_config(self, batch_size=2): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict[self.input_name] - - # cut to half length & take max batch_size 3 - sequence_length = input_ids.shape[-1] // 2 - input_ids = input_ids[:batch_size, :sequence_length] - - # generate max 3 tokens - max_length = input_ids.shape[-1] + 3 - if config.eos_token_id is not None and config.pad_token_id is None: - # hack to allow generate for models such as GPT2 as is done in `generate()` - if isinstance(config.eos_token_id, int): - config.eos_token_id = [config.eos_token_id] - config.pad_token_id = config.eos_token_id[0] - - attention_mask = ops.ones(input_ids.shape[:2], dtype=mindspore.int64)[:batch_size, :sequence_length] - - return config, input_ids.float(), attention_mask, max_length - - @staticmethod - def _get_encoder_outputs( - model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 - ): - encoder = model.get_encoder() - encoder_outputs = encoder( - input_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( - num_interleave, dim=0 - ) - generation_config = copy.deepcopy(model.generation_config) - model._prepare_special_tokens(generation_config) - input_ids = ( - ops.zeros(input_ids.shape[:2], dtype=mindspore.int64) - + generation_config.decoder_start_token_id - ) - attention_mask = None - return encoder_outputs, input_ids, attention_mask - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="SeamlessM4Tv2SpeechEncoder doesn't have an embedding layer") - def test_inputs_embeds(self): - pass - - 
@unittest.skip(reason="SeamlessM4TSpeechEncoder doesn't have an embedding layer") - def test_inputs_embeds_matches_input_ids(self): - pass - - @unittest.skip( - reason="Expected missing keys serve when using SeamlessM4Tv2ForXXX.from_pretrained from a checkpoint saved by SeamlessM4Tv2Model.save_pretrained." - ) - def test_model_weights_reload_no_missing_tied_weights(self): - pass - - @unittest.skip( - reason="SeamlessM4Tv2Model is base class but has actually a bigger architecture than seamlessM4T task-specific models." - ) - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="SeamlessM4Tv2Model can takes input_ids or input_features") - def test_forward_signature(self): - pass - - @unittest.skip(reason="SeamlessM4Tv2 has no base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - def test_attention_outputs(self): - # expected length is subsampled so need to change a bit this test - if not self.has_attentions: - self.skipTest(reason="Model does not output attentions") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - # no more chunk_length test - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, 
encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - - sub_sampled_length = ( - model._compute_sub_sample_lengths_from_attention_mask(inputs_dict["attention_mask"]).max().item() - ) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - sub_sampled_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - -@require_mindspore -class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - is_encoder_decoder = True - fx_compatible = False - test_missing_keys = False - test_pruning = False - test_model_parallel = False - test_resize_embeddings = True - test_headmasking = False - test_torchscript = False - - all_model_classes = ( - ( - SeamlessM4Tv2Model, - SeamlessM4Tv2ForTextToSpeech, - SeamlessM4Tv2ForTextToText, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (SeamlessM4Tv2ForTextToText,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = SeamlessM4Tv2ModelTester(self, input_modality="text") - self.config_tester = ConfigTester(self, config_class=SeamlessM4Tv2Config) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/seamless-m4t-v2-large" - model = SeamlessM4Tv2Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = 
model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - "adapter", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip( - reason="Expected missing keys serve when using SeamlessM4Tv2ForXXX.from_pretrained from a checkpoint saved by SeamlessM4Tv2Model.save_pretrained." - ) - def test_model_weights_reload_no_missing_tied_weights(self): - pass - - @unittest.skip(reason="SeamlessM4Tv2Model can take input_ids or input_features") - def test_forward_signature(self): - pass - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - @unittest.skip( - reason="SeamlessM4Tv2Model is base class but has actually a bigger architecture than seamlessM4T task-specific models." - ) - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="SeamlessM4Tv2 has no base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - -@require_mindspore -class SeamlessM4Tv2GenerationTest(unittest.TestCase): - # test that non-standard generation works - # test generation of: SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToSpeech, SeamlessM4Tv2ForSpeechToText, SeamlessM4Tv2ForTextToSpeech - - def setUp(self): - self.speech_model_tester = SeamlessM4Tv2ModelTester(self, input_modality="speech") - self.text_model_tester = SeamlessM4Tv2ModelTester(self, input_modality="text") - self.tmpdirname = tempfile.mkdtemp() - - def update_generation(self, model): - text_lang_code_to_id = { - "fra": 4, - "eng": 4, - "rus": 4, - } - - speech_lang_code_to_id = { - "fra": 4, - "eng": 4, - } - - id_to_text = {str(i): "a" for i in range(model.config.vocab_size)} - id_to_text["0"] = "ab" - id_to_text["1"] = "_b" - 
id_to_text["3"] = "," - id_to_text["4"] = "_cd" - - char_to_id = {char: i for (i, char) in enumerate("abcd")} - - generation_config = copy.deepcopy(model.generation_config) - - generation_config.__setattr__("text_decoder_lang_to_code_id", text_lang_code_to_id) - generation_config.__setattr__("t2u_lang_code_to_id", speech_lang_code_to_id) - generation_config.__setattr__("vocoder_lang_code_to_id", speech_lang_code_to_id) - generation_config.__setattr__("id_to_text", id_to_text) - generation_config.__setattr__("char_to_id", char_to_id) - generation_config.__setattr__("eos_token_id", 0) - - generation_config._from_model_config = False - - model.generation_config = generation_config - - def prepare_text_input(self, tgt_lang): - config, inputs, decoder_input_ids, input_mask, lm_labels = self.text_model_tester.prepare_config_and_inputs() - - input_dict = { - "input_ids": inputs, - "attention_mask": input_mask, - "tgt_lang": tgt_lang, - "num_beams": 2, - "do_sample": True, - } - - return config, input_dict - - def prepare_speech_input(self): - config, inputs, decoder_input_ids, input_mask, lm_labels = self.speech_model_tester.prepare_config_and_inputs() - - input_dict = { - "input_features": inputs, - "attention_mask": input_mask, - "tgt_lang": "fra", - "num_beams": 2, - "do_sample": True, - } - - return config, input_dict - - def prepare_speech_and_text_input(self): - config, inputs, decoder_input_ids, input_mask, lm_labels = self.speech_model_tester.prepare_config_and_inputs() - - input_speech = { - "input_features": inputs, - "attention_mask": input_mask, - "tgt_lang": "fra", - "num_beams": 2, - "do_sample": True, - } - - config, inputs, decoder_input_ids, input_mask, lm_labels = self.text_model_tester.prepare_config_and_inputs() - - input_text = { - "input_ids": inputs, - "attention_mask": input_mask, - "tgt_lang": "eng", - "num_beams": 2, - "do_sample": True, - } - return config, input_speech, input_text - - def factory_generation_speech_test(self, model, inputs): - set_seed(0) - output = model.generate(**inputs) - return output - - def test_generation_languages(self): - config, input_text_rus = self.prepare_text_input(tgt_lang="rus") - - model = SeamlessM4Tv2Model(config=config) - self.update_generation(model) - model.eval() - - # make sure that generating speech with a language that is only supported for text translation raises an error - with self.assertRaises(ValueError): - model.generate(**input_text_rus) - - # make sure that generating text only works - model.generate(**input_text_rus, generate_speech=False) - - # make sure it works for languages supported by both output modalities - config, input_text_eng = self.prepare_text_input(tgt_lang="eng") - model.generate(**input_text_eng) - model.generate(**input_text_eng, generate_speech=False) - - def test_speech_generation(self): - config, input_speech, input_text = self.prepare_speech_and_text_input() - - model = SeamlessM4Tv2Model(config=config) - self.update_generation(model) - model.save_pretrained(self.tmpdirname) - model.eval() - - output_original_text = self.factory_generation_speech_test(model, input_text) - output_original_speech = self.factory_generation_speech_test(model, input_speech) - - state_dict = model.state_dict() - - text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname) - self.update_generation(text_model) - text_model.eval() - - output_text = self.factory_generation_speech_test(text_model, input_text) - - speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname) - 
self.update_generation(speech_model) - speech_model.eval() - - for name, tensor in speech_model.state_dict().items(): - right_tensor = state_dict.get(name) - self.assertEqual(tensor.tolist(), right_tensor.tolist(), f"Tensor {name}") - - output_speech = self.factory_generation_speech_test(speech_model, input_speech) - - # test same text output from input text - self.assertListEqual(output_original_text[0].ravel().tolist(), output_text[0].ravel().tolist()) - self.assertListEqual(output_original_text[1].ravel().tolist(), output_text[1].ravel().tolist()) - - # test same speech output from input speech - # use assertTrue because the very long lists make assertListEqual hang in case of failure - self.assertTrue( - output_original_speech[0].ravel().tolist() == output_speech[0].ravel().tolist(), - "Speech generated was different", - ) - self.assertTrue( - output_original_speech[1].ravel().tolist() == output_speech[1].ravel().tolist(), - "Speech generated was different", - ) - - def test_text_generation(self): - config, input_speech, input_text = self.prepare_speech_and_text_input() - - # return text only (no speech generation) - input_speech["generate_speech"] = False - input_text["generate_speech"] = False - - model = SeamlessM4Tv2Model(config=config) - self.update_generation(model) - model.save_pretrained(self.tmpdirname) - model.eval() - - output_original_text = self.factory_generation_speech_test(model, input_text) - output_original_speech = self.factory_generation_speech_test(model, input_speech) - - # other models don't need it - input_speech.pop("generate_speech") - input_text.pop("generate_speech") - - state_dict = model.state_dict() - - text_model = SeamlessM4Tv2ForTextToText.from_pretrained(self.tmpdirname) - self.update_generation(text_model) - text_model.eval() - - for name, tensor in text_model.state_dict().items(): - right_tensor = state_dict.get(name) - self.assertEqual(tensor.tolist(), right_tensor.tolist()) - - output_text = self.factory_generation_speech_test(text_model, input_text) - - speech_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(self.tmpdirname) - - for name, tensor in speech_model.state_dict().items(): - right_tensor = state_dict.get(name) - self.assertEqual(tensor.tolist(), right_tensor.tolist(), f"Tensor {name}") - - self.update_generation(speech_model) - speech_model.eval() - - output_speech = self.factory_generation_speech_test(speech_model, input_speech) - - # test same text output from input text - self.assertListEqual(output_original_text[0].ravel().tolist(), output_text.ravel().tolist()) - - # test same text output from input speech - self.assertListEqual(output_original_speech[0].ravel().tolist(), output_speech.ravel().tolist()) - - def test_generation(self): - config, input_speech, input_text = self.prepare_speech_and_text_input() - - input_speech["num_beams"] = 3 - input_speech["do_sample"] = True - input_speech["temperature"] = 0.5 - input_speech["num_return_sequences"] = 3 - - input_text["num_beams"] = 3 - input_text["do_sample"] = True - input_text["temperature"] = 0.5 - input_text["num_return_sequences"] = 3 - - for model_class in [SeamlessM4Tv2ForSpeechToSpeech, SeamlessM4Tv2ForSpeechToText, SeamlessM4Tv2Model]: - model = model_class(config=config) - self.update_generation(model) - model.eval() - - output = model.generate(**input_speech) - output = output[0] if isinstance(output, tuple) else output - - self.assertEqual(output.shape[0], 3 * input_speech["input_features"].shape[0]) - - for model_class in [SeamlessM4Tv2ForTextToSpeech, SeamlessM4Tv2ForTextToText, SeamlessM4Tv2Model]: - model = 
model_class(config=config) - self.update_generation(model) - model.eval() - - output = model.generate(**input_text) - - output = output[0] if isinstance(output, tuple) else output - - self.assertEqual(output.shape[0], 3 * input_text["input_ids"].shape[0]) - - -@require_mindspore -class SeamlessM4Tv2ModelIntegrationTest(unittest.TestCase): - repo_id = "facebook/seamless-m4t-v2-large" - - def assertListAlmostEqual(self, list1, list2, tol=1e-4): - self.assertEqual(len(list1), len(list2)) - for a, b in zip(list1, list2): - self.assertAlmostEqual(a, b, delta=tol) - - @cached_property - def processor(self): - return SeamlessM4TProcessor.from_pretrained(self.repo_id) - - @cached_property - def input_text(self): - # corresponds to "C'est un test." with seamlessM4T_medium checkpoint - - input_ids = mindspore.tensor([[256026, 109, 247729, 171, 128, 6816, 247676, 3]]) # fmt: skip - - attention_mask = ops.ones_like(input_ids) - - inputs = { - "attention_mask": attention_mask, - "input_ids": input_ids, - } - - return inputs - - @cached_property - def input_audio(self): - set_seed(0) - seq_len = 20000 - sampling_rate = 16000 - input_features = ops.rand((2, seq_len)) - - return self.processor(audios=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="ms") - - def factory_test_task(self, class1, class2, inputs, class1_kwargs, class2_kwargs): - # half-precision loading to limit GPU usage - model1 = class1.from_pretrained(self.repo_id, torch_dtype=mindspore.float16) - model2 = class2.from_pretrained(self.repo_id, torch_dtype=mindspore.float16) - - set_seed(0) - output_1 = model1.generate(**inputs, **class1_kwargs) - set_seed(0) - output_2 = model2.generate(**inputs, **class2_kwargs) - - for key in output_1: - if isinstance(output_1[key], mindspore.Tensor): - if len(output_1[key].shape) == 0: - self.assertEqual(output_1[key].item(), output_2[key].item()) - else: - self.assertListAlmostEqual(output_1[key].squeeze().tolist(), output_2[key].squeeze().tolist()) - - @slow - def test_to_eng_text(self): - model = SeamlessM4Tv2Model.from_pretrained(self.repo_id) - - # test text - tgt lang: eng - - expected_text_tokens = [3, 256022, 3080, 1, 247669, 10, 6816, 247676, 3] # fmt: skip - - # fmt: off - expected_unit_tokens = [ - 4746,7163,8208,8208,1315,1266,4307,1119,989,9594,3007,3007,4341,5205,7631,7631,3202,4061,9092,3191,7509,1715, - 5280,5280,3554,8812,8197,6366,5382,5382,7330,2758,9433,9433,6863,7510,5800,5800,5286,1948,1825,1825,3956,8724, - 8724,5331,8914,9315,9315,5288,2588,8167,8787,8787,8063,6008,2621,2621,2621,5696 - ] - # fmt: on - - expected_wav_slice = [9.485097e-04, 8.320558e-04, 7.178137e-04, 9.349979e-04, 1.121628e-03, 1.091766e-03, 1.279693e-03, 1.387754e-03, 1.296396e-03, 1.143557e-03] # fmt: skip - - set_seed(0) - output = model.generate(**self.input_text, num_beams=1, tgt_lang="eng", return_intermediate_token_ids=True) - - self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - self.assertListEqual( - expected_unit_tokens, (output.unit_sequences - model.config.vocoder_offset).squeeze().tolist() - ) - - self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) - - # assert mean and std equality - self.assertListAlmostEqual( - [-2.349690e-04, 9.920777e-02], [output.waveform.mean().item(), output.waveform.std().item()] - ) - - @slow - @unittest.skip(reason="Equivalence is broken since a new update") - def test_to_swh_text(self): - model = SeamlessM4Tv2Model.from_pretrained(self.repo_id) - - # test text - tgt lang: swh - - 
expected_text_tokens = [3, 256084, 109, 247729, 171, 10, 6816, 247676, 3] # fmt: skip - - # fmt: off - expected_unit_tokens = [ - 5725,7163,7472,7472,6915,3099,3099,9921,2765,6515,6515,1374,1374,1347,8252,9854,9854,5662,2420,6600,2216,4503, - 7208,6107,6107,7298,9123,6472,9663,9663,6366,6366,6445,575,3575,2052,2052,5788,5800,5800,5286,5286,1825,1825,3956, - 3956,8724,8724,5331,8914,8914,9315,9315,2821,8167,8167,8787,8787,8787,8700,8700,8700,2175,2175,3196,3196,2621,1725, - 1725,7507,5696 - ] - # fmt: on - - expected_wav_slice = [3.124037e-04, 2.450471e-04, 2.286572e-04, 2.317214e-04, 2.732605e-04, 2.478790e-04, 2.704144e-04, 2.665847e-04, 2.828784e-04, 2.684390e-04] # fmt: skip - - set_seed(0) - output = model.generate(**self.input_text, num_beams=1, tgt_lang="swh", return_intermediate_token_ids=True) - - self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - self.assertListEqual( - expected_unit_tokens, (output.unit_sequences - model.config.vocoder_offset).squeeze().tolist() - ) - - self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) - - # assert mean and std equality - self.assertListAlmostEqual( - [-2.001826e-04, 8.580012e-02], [output.waveform.mean().item(), output.waveform.std().item()] - ) - - @slow - def test_to_rus_speech(self): - model = SeamlessM4Tv2Model.from_pretrained(self.repo_id) - - # test audio - tgt lang: rus - - expected_text_tokens = [3, 256074, 107, 248213, 404, 247792, 247789, 3] # fmt: skip - - # fmt: off - expected_unit_tokens = [ - 8976,7163,6915,2728,2728,5198,3318,3318,3686,1049,9643,1200,2052,2052,8196,8196,7624,7624,7555,7555,7555,7555, - 9717,9717,4869,8167,8167,8167,8053,972,9362,8167,297,297,297,3993,3993,3993,3993,4660,4660,4660,4660,4660,4660, - 7962,7962,225,225,8737,4199 - ] - # fmt: on - - expected_wav_slice = [1.415287e-03, 1.360976e-03, 1.297727e-03, 1.305321e-03, 1.352087e-03, 1.283812e-03, 1.352623e-03, 1.387384e-03, 1.449627e-03, 1.411701e-03] # fmt: skip - - set_seed(0) - output = model.generate(**self.input_audio, num_beams=1, tgt_lang="rus", return_intermediate_token_ids=True) - - self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist()) - self.assertListEqual( - expected_unit_tokens, (output.unit_sequences - model.config.vocoder_offset).squeeze().tolist() - ) - - self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60]) - - # assert mean and std equality - higher tolerance for speech - self.assertListAlmostEqual( - [-2.818016e-04, 7.169888e-02], [output.waveform.mean().item(), output.waveform.std().item()], tol=5e-4 - ) - - @slow - def test_text_to_text_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False} - kwargs2 = { - "tgt_lang": "eng", - "output_hidden_states": True, - "return_dict_in_generate": True, - "output_scores": True, - } - self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForTextToText, self.input_text, kwargs1, kwargs2) - - @slow - def test_speech_to_text_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False} - kwargs2 = { - "tgt_lang": "eng", - "output_hidden_states": True, - "return_dict_in_generate": True, - "output_scores": True, - } - self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToText, self.input_audio, kwargs1, kwargs2) - - @slow - def test_speech_to_speech_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True} - 
self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToSpeech, self.input_audio, kwargs1, kwargs1) - - @slow - def test_text_to_speech_model(self): - kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True} - - self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForTextToSpeech, self.input_text, kwargs1, kwargs1) diff --git a/tests/transformers/models/segformer/__init__.py b/tests/transformers/models/segformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/segformer/test_modeling_segformer.py b/tests/transformers/models/segformer/test_modeling_segformer.py deleted file mode 100644 index 8eb46f66a..000000000 --- a/tests/transformers/models/segformer/test_modeling_segformer.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore SegFormer model. """ - - -import unittest -import numpy as np -from mindnlp.transformers import SegformerConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - SegformerForImageClassification, - SegformerForSemanticSegmentation, - SegformerModel, - ) - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import SegformerImageProcessor - - - -class SegformerConfigTester(ConfigTester): - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, "hidden_sizes")) - self.parent.assertTrue(hasattr(config, "num_attention_heads")) - self.parent.assertTrue(hasattr(config, "num_encoder_blocks")) - - -class SegformerModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=64, - num_channels=3, - num_encoder_blocks=4, - depths=[1, 1, 1, 1], - sr_ratios=[8, 4, 2, 1], - hidden_sizes=[8, 8, 16, 16], - downsampling_rates=[1, 4, 8, 16], - num_attention_heads=[1, 1, 2, 2], - is_training=True, - use_labels=True, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - num_labels=3, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.num_encoder_blocks = num_encoder_blocks - self.sr_ratios = sr_ratios - self.depths = depths - self.hidden_sizes = hidden_sizes - self.downsampling_rates = downsampling_rates - self.num_attention_heads = num_attention_heads - self.is_training = is_training - self.use_labels = use_labels - self.hidden_act 
= hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) - - config = self.get_config() - return config, pixel_values, labels - - def get_config(self): - return SegformerConfig( - image_size=self.image_size, - num_channels=self.num_channels, - num_encoder_blocks=self.num_encoder_blocks, - depths=self.depths, - hidden_sizes=self.hidden_sizes, - num_attention_heads=self.num_attention_heads, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = SegformerModel(config=config) - model.set_train(False) - result = model(pixel_values) - expected_height = expected_width = self.image_size // (self.downsampling_rates[-1] * 2) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], expected_height, expected_width) - ) - - def create_and_check_for_image_segmentation(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = SegformerForSemanticSegmentation(config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4) - ) - result = model(pixel_values, labels=labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4) - ) - self.parent.assertGreater(result.loss, 0.0) - - def create_and_check_for_binary_image_segmentation(self, config, pixel_values, labels): - config.num_labels = 1 - model = SegformerForSemanticSegmentation(config=config) - model.set_train(False) - labels = ops.randint(0, 1, (self.batch_size, self.image_size, self.image_size)) - result = model(pixel_values, labels=labels) - self.parent.assertGreater(result.loss, 0.0) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class SegformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - SegformerModel, - SegformerForSemanticSegmentation, - SegformerForImageClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "image-feature-extraction": SegformerModel, - "image-classification": SegformerForImageClassification, - "image-segmentation": SegformerForSemanticSegmentation, - } - if is_mindspore_available() - else {} - ) - - fx_compatible = True - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - - def setUp(self): - self.model_tester = SegformerModelTester(self) - self.config_tester = SegformerConfigTester(self, config_class=SegformerConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_binary_image_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_binary_image_segmentation(*config_and_inputs) - - def test_for_image_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs) - - @unittest.skip("SegFormer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip("SegFormer does not have get_input_embeddings method and get_output_embeddings methods") - def test_model_get_set_embeddings(self): - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - - expected_num_attentions = sum(self.model_tester.depths) - self.assertEqual(len(attentions), expected_num_attentions) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - - self.assertEqual(len(attentions), expected_num_attentions) - - # verify the first attentions (first block, first layer) - expected_seq_len = (self.model_tester.image_size // 4) ** 2 - expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], - ) - - # verify the last attentions (last block, last layer) - expected_seq_len = (self.model_tester.image_size // 32) ** 2 - expected_reduced_seq_len = (self.model_tester.image_size // (32 * self.model_tester.sr_ratios[-1])) ** 2 - self.assertListEqual( - list(attentions[-1].shape[-3:]), - [self.model_tester.num_attention_heads[-1], expected_seq_len, expected_reduced_seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), expected_num_attentions) - # verify the first attentions (first block, first layer) - expected_seq_len = (self.model_tester.image_size // 4) ** 2 - expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - 
expected_num_layers = self.model_tester.num_encoder_blocks - self.assertEqual(len(hidden_states), expected_num_layers) - - # verify the first hidden states (first block) - self.assertListEqual( - list(hidden_states[0].shape[-3:]), - [ - self.model_tester.hidden_sizes[0], - self.model_tester.image_size // 4, - self.model_tester.image_size // 4, - ], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_training(self): - if not self.model_tester.is_training: - return - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - if model_class.__name__ in MODEL_MAPPING_NAMES.values(): - continue - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - # loss.backward() - - @slow - def test_model_from_pretrained(self): - model_name = "nvidia/segformer-b0-finetuned-ade-512-512" - model = SegformerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -class SegformerModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_image_segmentation_ade(self): - # only resize + normalize - image_processor = SegformerImageProcessor( - image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False - ) - model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") - - image = prepare_img() - encoded_inputs = image_processor(images=image, return_tensors="ms") - pixel_values = encoded_inputs.pixel_values - - outputs = model(pixel_values) - - expected_shape = (1, model.config.num_labels, 128, 128) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], - [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], - [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], - ] - ) - print(outputs.logits[0, :3, :3, :3].asnumpy(), expected_slice.asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - @slow - def test_inference_image_segmentation_city(self): - # only resize + normalize - image_processor = SegformerImageProcessor( - image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False - ) - model = SegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b1-finetuned-cityscapes-1024-1024" - ) - - image = prepare_img() - encoded_inputs = image_processor(images=image, return_tensors="ms") - pixel_values = encoded_inputs.pixel_values - - outputs = model(pixel_values) - - expected_shape = (1, model.config.num_labels, 128, 128) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [[-13.5748, 
-13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]], - [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]], - [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]], - ] - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-1)) - - @slow - def test_post_processing_semantic_segmentation(self): - # only resize + normalize - image_processor = SegformerImageProcessor( - image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False - ) - model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") - image = prepare_img() - encoded_inputs = image_processor(images=image, return_tensors="ms") - pixel_values = encoded_inputs.pixel_values - - outputs = model(pixel_values) - - outputs.logits = outputs.logits - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[(500, 300)]) - expected_shape = (500, 300) - self.assertEqual(segmentation[0].shape, expected_shape) - - segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs) - expected_shape = (128, 128) - self.assertEqual(segmentation[0].shape, expected_shape) diff --git a/tests/transformers/models/seggpt/__init__.py b/tests/transformers/models/seggpt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/seggpt/test_image_processing_seggpt.py b/tests/transformers/models/seggpt/test_image_processing_seggpt.py deleted file mode 100644 index e1fa6528c..000000000 --- a/tests/transformers/models/seggpt/test_image_processing_seggpt.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================ -"""Testing suite for the SegGpt model processing.""" - -import unittest - -import numpy as np -from datasets import load_dataset - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - - from mindnlp.transformers.models.seggpt.modeling_seggpt import SegGptImageSegmentationOutput - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import SegGptImageProcessor - - -class SegGptImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def expected_post_processed_shape(self): - return self.size["height"] // 2, self.size["width"] - - def get_fake_image_segmentation_output(self): - ms.set_seed(42) - return SegGptImageSegmentationOutput( - pred_masks=ops.rand(self.batch_size, self.num_channels, - self.size["height"], self.size["width"]) - ) - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -def prepare_mask(): - ds = load_dataset("EduardoPacheco/seggpt-example-data")["train"] - return ds[0]["mask"].convert("L") - - -def prepare_img(): - ds = load_dataset("EduardoPacheco/seggpt-example-data")["train"] - images = [image.convert("RGB") for image in ds["image"]] - masks = [image.convert("RGB") for image in ds["mask"]] - return images, masks - - -@require_mindspore -@require_vision -class SegGptImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = SegGptImageProcessor - - def setUp(self): - self.image_processor_tester = SegGptImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class( - **self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, 
"size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - def test_image_processor_palette(self): - num_labels = 3 - image_processing = self.image_processing_class( - **self.image_processor_dict) - palette = image_processing.get_palette(num_labels) - self.assertEqual(len(palette), num_labels + 1) - self.assertEqual(palette[0], (0, 0, 0)) - - def test_mask_equivalence(self): - image_processor = SegGptImageProcessor() - - mask_binary = prepare_mask() - mask_rgb = mask_binary.convert("RGB") - - inputs_binary = image_processor( - images=None, prompt_masks=mask_binary, return_tensors="ms") - inputs_rgb = image_processor( - images=None, prompt_masks=mask_rgb, return_tensors="ms", do_convert_rgb=False) - - self.assertTrue( - (inputs_binary["prompt_masks"] == inputs_rgb["prompt_masks"]).all().item()) - - def test_mask_to_rgb(self): - image_processing = self.image_processing_class( - **self.image_processor_dict) - mask = prepare_mask() - mask = np.array(mask) - mask = (mask > 0).astype(np.uint8) - - def check_two_colors(image, color1=(0, 0, 0), color2=(255, 255, 255)): - pixels = image.transpose(1, 2, 0).reshape(-1, 3) - unique_colors = np.unique(pixels, axis=0) - return bool(len(unique_colors) == 2 and (color1 in unique_colors) and (color2 in unique_colors)) - - num_labels = 1 - palette = image_processing.get_palette(num_labels) - - # Should only duplicate repeat class indices map, hence only (0,0,0) and (1,1,1) - mask_duplicated = image_processing.mask_to_rgb(mask) - # Mask using palette, since only 1 class is present we have colors (0,0,0) and (255,255,255) - mask_painted = image_processing.mask_to_rgb(mask, palette=palette) - - self.assertTrue(check_two_colors(mask_duplicated, color2=(1, 1, 1))) - self.assertTrue(check_two_colors(mask_painted, color2=(255, 255, 255))) - - def test_post_processing_semantic_segmentation(self): - image_processor = self.image_processing_class( - **self.image_processor_dict) - outputs = self.image_processor_tester.get_fake_image_segmentation_output() - post_processed = image_processor.post_process_semantic_segmentation( - outputs) - - self.assertEqual(len(post_processed), - self.image_processor_tester.batch_size) - - expected_semantic_map_shape = self.image_processor_tester.expected_post_processed_shape() - self.assertEqual(post_processed[0].shape, expected_semantic_map_shape) - - @slow - def test_pixel_values(self): - images, masks = prepare_img() - input_image = images[1] - prompt_image = images[0] - prompt_mask = masks[0] - - image_processor = SegGptImageProcessor.from_pretrained( - "BAAI/seggpt-vit-large") - - inputs = image_processor( - images=input_image, - prompt_images=prompt_image, - prompt_masks=prompt_mask, - return_tensors="ms", - do_convert_rgb=False, - ) - - # Verify pixel values - expected_prompt_pixel_values = ms.Tensor( - [ - [[-0.6965, -0.6965, -0.6965], [-0.6965, -0.6965, -0.6965], - [-0.6965, -0.6965, -0.6965]], - [[1.6583, 1.6583, 1.6583], [1.6583, 1.6583, 1.6583], - [1.6583, 1.6583, 1.6583]], - [[2.3088, 2.3088, 2.3088], [2.3088, 2.3088, 2.3088], - [2.3088, 2.3088, 2.3088]], - ] - ) - - expected_pixel_values = ms.Tensor( - [ - [[1.6324, 1.6153, 1.5810], [1.6153, 1.5982, 1.5810], - [1.5810, 1.5639, 1.5639]], - 
[[1.2731, 1.2556, 1.2206], [1.2556, 1.2381, 1.2031], - [1.2206, 1.2031, 1.1681]], - [[1.6465, 1.6465, 1.6465], [1.6465, 1.6465, 1.6465], - [1.6291, 1.6291, 1.6291]], - ] - ) - - expected_prompt_masks = ms.Tensor( - [ - [[-2.1179, -2.1179, -2.1179], [-2.1179, -2.1179, -2.1179], - [-2.1179, -2.1179, -2.1179]], - [[-2.0357, -2.0357, -2.0357], [-2.0357, -2.0357, -2.0357], - [-2.0357, -2.0357, -2.0357]], - [[-1.8044, -1.8044, -1.8044], [-1.8044, -1.8044, -1.8044], - [-1.8044, -1.8044, -1.8044]], - ] - ) - - self.assertTrue(np.allclose(inputs.pixel_values[0, :, :3, :3].asnumpy( - ), expected_pixel_values.asnumpy(), atol=1e-4)) - self.assertTrue( - np.allclose(inputs.prompt_pixel_values[0, :, :3, :3].asnumpy( - ), expected_prompt_pixel_values.asnumpy(), atol=1e-4) - ) - self.assertTrue(np.allclose(inputs.prompt_masks[0, :, :3, :3].asnumpy( - ), expected_prompt_masks.asnumpy(), atol=1e-4)) - - def test_prompt_mask_equivalence(self): - image_processor = self.image_processing_class( - **self.image_processor_dict) - image_size = self.image_processor_tester.image_size - - # Single Mask Examples - expected_single_shape = [1, 3, image_size, image_size] - - # Single Semantic Map (2D) - image_np_2d = np.ones((image_size, image_size)) - image_pt_2d = ops.ones((image_size, image_size)) - image_pil_2d = Image.fromarray(image_np_2d) - - inputs_np_2d = image_processor( - images=None, prompt_masks=image_np_2d, return_tensors="ms") - inputs_pt_2d = image_processor( - images=None, prompt_masks=image_pt_2d, return_tensors="ms") - inputs_pil_2d = image_processor( - images=None, prompt_masks=image_pil_2d, return_tensors="ms") - - self.assertTrue( - (inputs_np_2d["prompt_masks"] == inputs_pt_2d["prompt_masks"]).all().item()) - self.assertTrue( - (inputs_np_2d["prompt_masks"] == inputs_pil_2d["prompt_masks"]).all().item()) - self.assertEqual( - list(inputs_np_2d["prompt_masks"].shape), expected_single_shape) - - # Single RGB Images (3D) - image_np_3d = np.ones((3, image_size, image_size)) - image_pt_3d = ops.ones((3, image_size, image_size)) - image_pil_3d = Image.fromarray( - image_np_3d.transpose(1, 2, 0).astype(np.uint8)) - - inputs_np_3d = image_processor( - images=None, prompt_masks=image_np_3d, return_tensors="ms", do_convert_rgb=False - ) - inputs_pt_3d = image_processor( - images=None, prompt_masks=image_pt_3d, return_tensors="ms", do_convert_rgb=False - ) - inputs_pil_3d = image_processor( - images=None, prompt_masks=image_pil_3d, return_tensors="ms", do_convert_rgb=False - ) - - self.assertTrue( - (inputs_np_3d["prompt_masks"] == inputs_pt_3d["prompt_masks"]).all().item()) - self.assertTrue( - (inputs_np_3d["prompt_masks"] == inputs_pil_3d["prompt_masks"]).all().item()) - self.assertEqual( - list(inputs_np_3d["prompt_masks"].shape), expected_single_shape) - - # Batched Examples - expected_batched_shape = [2, 3, image_size, image_size] - - # Batched Semantic Maps (3D) - image_np_2d_batched = np.ones((2, image_size, image_size)) - image_pt_2d_batched = ops.ones((2, image_size, image_size)) - - inputs_np_2d_batched = image_processor( - images=None, prompt_masks=image_np_2d_batched, return_tensors="ms") - inputs_pt_2d_batched = image_processor( - images=None, prompt_masks=image_pt_2d_batched, return_tensors="ms") - - self.assertTrue( - (inputs_np_2d_batched["prompt_masks"] == inputs_pt_2d_batched["prompt_masks"]).all().item()) - self.assertEqual( - list(inputs_np_2d_batched["prompt_masks"].shape), expected_batched_shape) - - # Batched RGB images - image_np_4d = np.ones((2, 3, image_size, image_size)) - 
image_pt_4d = ops.ones((2, 3, image_size, image_size)) - - inputs_np_4d = image_processor( - images=None, prompt_masks=image_np_4d, return_tensors="ms", do_convert_rgb=False - ) - inputs_pt_4d = image_processor( - images=None, prompt_masks=image_pt_4d, return_tensors="ms", do_convert_rgb=False - ) - - self.assertTrue( - (inputs_np_4d["prompt_masks"] == inputs_pt_4d["prompt_masks"]).all().item()) - self.assertEqual( - list(inputs_np_4d["prompt_masks"].shape), expected_batched_shape) - - # Comparing Single and Batched Examples - self.assertTrue( - (inputs_np_2d["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) - self.assertTrue( - (inputs_np_2d_batched["prompt_masks"][0] == inputs_np_2d["prompt_masks"][0]).all().item()) - self.assertTrue( - (inputs_np_2d_batched["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) - self.assertTrue( - (inputs_np_2d_batched["prompt_masks"][0] == inputs_np_4d["prompt_masks"][0]).all().item()) - self.assertTrue( - (inputs_np_2d_batched["prompt_masks"][0] == inputs_np_3d["prompt_masks"][0]).all().item()) diff --git a/tests/transformers/models/seggpt/test_modeling_seggpt.py b/tests/transformers/models/seggpt/test_modeling_seggpt.py deleted file mode 100644 index 74e0cbcd0..000000000 --- a/tests/transformers/models/seggpt/test_modeling_seggpt.py +++ /dev/null @@ -1,463 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore SegGpt model.""" - -import inspect -import math -import unittest - -from datasets import load_dataset - -from mindnlp.transformers import SegGptConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import SegGptForImageSegmentation, SegGptModel - from mindnlp.transformers.models.seggpt.modeling_seggpt import SegGptLoss - - -if is_vision_available(): - from mindnlp.transformers import SegGptImageProcessor - - -class SegGptModelTester: - def __init__( - self, - parent, - batch_size=2, - image_size=30, - patch_size=2, - num_channels=3, - is_training=False, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - mlp_ratio=2.0, - merge_index=0, - intermediate_hidden_state_indices=[1], - pretrain_image_size=10, - decoder_hidden_size=10, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.mlp_ratio = mlp_ratio - self.merge_index = merge_index - self.intermediate_hidden_state_indices = intermediate_hidden_state_indices - self.pretrain_image_size = pretrain_image_size - self.decoder_hidden_size = decoder_hidden_size - - # in SegGpt, the seq length equals the number of patches (we don't use the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size // 2, self.image_size]) - prompt_pixel_values = floats_tensor( - [self.batch_size, self.num_channels, self.image_size // 2, self.image_size] - ) - prompt_masks = floats_tensor([self.batch_size, self.num_channels, self.image_size // 2, self.image_size]) - - labels = None - if self.use_labels: - labels = floats_tensor([self.batch_size, self.num_channels, self.image_size // 2, self.image_size]) - - config = self.get_config() - - return config, pixel_values, prompt_pixel_values, prompt_masks, labels - - def get_config(self): - return SegGptConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - initializer_range=self.initializer_range, - mlp_ratio=self.mlp_ratio, - merge_index=self.merge_index, - intermediate_hidden_state_indices=self.intermediate_hidden_state_indices, - pretrain_image_size=self.pretrain_image_size, - 
decoder_hidden_size=self.decoder_hidden_size, - ) - - def create_and_check_model(self, config, pixel_values, prompt_pixel_values, prompt_masks, labels): - model = SegGptModel(config=config) - model.eval() - result = model(pixel_values, prompt_pixel_values, prompt_masks) - self.parent.assertEqual( - result.last_hidden_state.shape, - ( - self.batch_size, - self.image_size // self.patch_size, - self.image_size // self.patch_size, - self.hidden_size, - ), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - prompt_pixel_values, - prompt_masks, - labels, - ) = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "prompt_pixel_values": prompt_pixel_values, - "prompt_masks": prompt_masks, - } - return config, inputs_dict - - -@require_mindspore -class SegGptModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as SegGpt does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (SegGptModel, SegGptForImageSegmentation) if is_mindspore_available() else () - fx_compatible = False - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_torchscript = False - pipeline_model_mapping = ( - {"feature-extraction": SegGptModel, "mask-generation": SegGptModel} if is_mindspore_available() else {} - ) - - def setUp(self): - self.model_tester = SegGptModelTester(self) - self.config_tester = ConfigTester(self, config_class=SegGptConfig, has_text_modality=False) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="SegGpt does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values", "prompt_pixel_values", "prompt_masks"] - self.assertListEqual(arg_names[:3], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - patch_height = patch_width = config.image_size // config.patch_size - - self.assertListEqual( - list(hidden_states[0].shape[-3:]), - [patch_height, patch_width, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in 
self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_batching_equivalence(self): - def recursive_check(batched_object, single_row_object, model_name, key): - if isinstance(batched_object, (list, tuple)): - for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - else: - batched_row = batched_object[:1] - self.assertFalse( - ops.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" - ) - self.assertTrue( - ops.max(ops.abs(batched_row - single_row_object)) <= 1e-03, - msg=( - f"Batched and Single row outputs are not equal in {model_name} for key={key}. " - f"Difference={ops.max(ops.abs(batched_row - single_row_object))}." - ), - ) - - config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - config.output_hidden_states = True - - model_name = model_class.__name__ - batched_input_prepared = self._prepare_for_class(batched_input, model_class) - model = model_class(config).eval() - - batch_size = self.model_tester.batch_size - single_row_input = {} - for key, value in batched_input_prepared.items(): - if isinstance(value, mindspore.Tensor) and value.shape[0] % batch_size == 0: - single_batch_shape = value.shape[0] // batch_size - single_row_input[key] = value[:single_batch_shape] - - with no_grad(): - model_batched_output = model(**batched_input_prepared) - model_row_output = model(**single_row_input) - - for key in model_batched_output: - # the first hidden state in SegGPT has weird hack of adding first half of batch with second half - if key == "hidden_states": - model_batched_output[key] = model_batched_output[key][1:] - model_row_output[key] = model_row_output[key][1:] - recursive_check(model_batched_output[key], model_row_output[key], model_name, key) - - def test_seggpt_loss(self): - mindspore.manual_seed(100) - mindspore.set_seed(100) - config = self.model_tester.get_config() - - prompt_masks = ops.rand(1, config.num_channels, config.image_size, config.image_size) - label = ops.rand(1, config.num_channels, config.image_size, config.image_size) - pred_masks = ops.rand(1, config.num_channels, config.image_size * 2, config.image_size) - # seq_len x 2 because the loss concatenates prompt_masks and labels as pred_masks is concatenated - bool_masked_pos = ops.rand(1, self.model_tester.seq_length * 2) > 0.5 - - loss = SegGptLoss(config) - loss_value = loss(prompt_masks, pred_masks, label, bool_masked_pos) - print(loss_value) - expected_loss_value = mindspore.tensor(0.3267) - - self.assertTrue(ops.allclose(loss_value, expected_loss_value, atol=1e-3)) - - @slow - def test_model_from_pretrained(self): - model_name = "BAAI/seggpt-vit-large" - model = SegGptModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -def 
prepare_img(): - ds = load_dataset("EduardoPacheco/seggpt-example-data")["train"] - images = [image.convert("RGB") for image in ds["image"]] - masks = [image.convert("RGB") for image in ds["mask"]] - return images, masks - - -def prepare_bool_masked_pos(config: SegGptConfig): - num_patches = math.prod([i // config.patch_size for i in config.image_size]) - mask_ratio = 0.75 - mindspore.manual_seed(2) - mindspore.set_seed(2) - num_masked_patches = int(num_patches * mask_ratio) - shuffle_idx = ops.randperm(num_patches) - bool_masked_pos = mindspore.Tensor([0] * (num_patches - num_masked_patches) + [1] * num_masked_patches)[ - shuffle_idx - ] - bool_masked_pos = bool_masked_pos.unsqueeze(0).bool() - - return bool_masked_pos - - -@require_mindspore -@require_vision -class SegGptModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large") if is_vision_available() else None - - @slow - def test_one_shot_inference(self): - model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large") - - image_processor = self.default_image_processor - - images, masks = prepare_img() - input_image = images[1] - prompt_image = images[0] - prompt_mask = masks[0] - - inputs = image_processor( - images=input_image, - prompt_images=prompt_image, - prompt_masks=prompt_mask, - return_tensors="ms", - do_convert_rgb=False, - ) - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 3, 896, 448) - self.assertEqual(outputs.pred_masks.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [[-2.1208, -2.1190, -2.1198], [-2.1237, -2.1228, -2.1227], [-2.1232, -2.1226, -2.1228]], - [[-2.0405, -2.0396, -2.0403], [-2.0434, -2.0434, -2.0433], [-2.0428, -2.0432, -2.0434]], - [[-1.8102, -1.8088, -1.8099], [-1.8131, -1.8126, -1.8129], [-1.8130, -1.8128, -1.8131]], - ] - ) - - self.assertTrue(ops.allclose(outputs.pred_masks[0, :, :3, :3], expected_slice, atol=1e-4)) - - result = image_processor.post_process_semantic_segmentation(outputs, [input_image.size[::-1]])[0] - - result_expected_shape = (170, 297) - expected_area = 1082 - area = (result > 0).sum().item() - self.assertEqual(result.shape, result_expected_shape) - self.assertEqual(area, expected_area) - - @slow - def test_few_shot_inference(self): - model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large") - image_processor = self.default_image_processor - - images, masks = prepare_img() - input_images = [images[1]] * 2 - prompt_images = [images[0], images[2]] - prompt_masks = [masks[0], masks[2]] - - inputs = image_processor( - images=input_images, - prompt_images=prompt_images, - prompt_masks=prompt_masks, - return_tensors="ms", - do_convert_rgb=False, - ) - - inputs = {k: v for k, v in inputs.items()} - with no_grad(): - outputs = model(**inputs, feature_ensemble=True) - - expected_shape = (2, 3, 896, 448) - expected_slice = mindspore.tensor( - [ - [[-2.1201, -2.1192, -2.1189], [-2.1217, -2.1210, -2.1204], [-2.1216, -2.1202, -2.1194]], - [[-2.0393, -2.0390, -2.0387], [-2.0402, -2.0402, -2.0397], [-2.0400, -2.0394, -2.0388]], - [[-1.8083, -1.8076, -1.8077], [-1.8105, -1.8102, -1.8099], [-1.8105, -1.8095, -1.8090]], - ] - ) - - self.assertEqual(outputs.pred_masks.shape, expected_shape) - self.assertTrue(ops.allclose(outputs.pred_masks[0, :, 448:451, :3], expected_slice, atol=4e-4)) - - @slow - def test_one_shot_with_label(self): - model = 
SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large") - - image_processor = self.default_image_processor - - images, masks = prepare_img() - - input_image = images[1] - label = masks[1] - prompt_image = images[0] - prompt_mask = masks[0] - - inputs = image_processor( - images=input_image, - prompt_masks=prompt_mask, - prompt_images=prompt_image, - return_tensors="ms", - do_convert_rgb=False, - ) - - labels = image_processor(images=None, prompt_masks=label, return_tensors="ms", do_convert_rgb=False)[ - "prompt_masks" - ] - - bool_masked_pos = prepare_bool_masked_pos(model.config) - - with no_grad(): - outputs = model(**inputs, labels=labels, bool_masked_pos=bool_masked_pos) - - expected_loss = mindspore.tensor(0.0074) - self.assertTrue(ops.allclose(outputs.loss, expected_loss, atol=1e-4)) diff --git a/tests/transformers/models/sew/__init__.py b/tests/transformers/models/sew/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/sew/test_modeling_sew.py b/tests/transformers/models/sew/test_modeling_sew.py deleted file mode 100644 index 5126d2cbe..000000000 --- a/tests/transformers/models/sew/test_modeling_sew.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""Testing suite for the MindSpore Hubert model.""" - -import math -import unittest - -import pytest - -import numpy as np - -from mindnlp.utils.testing_utils import ( - require_soundfile, - is_mindspore_available, - slow, - require_mindspore, -) - -from mindnlp.transformers import SEWConfig - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - SEWForCTC, - SEWForSequenceClassification, - SEWModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - from mindnlp.transformers.models.hubert.modeling_hubert import _compute_mask_indices - - -class SEWModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=32, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(64, 32, 32), - conv_stride=(5, 2, 1), - conv_kernel=(10, 3, 1), - conv_bias=False, - num_conv_pos_embeddings=31, - num_conv_pos_embedding_groups=2, - squeeze_factor=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - vocab_size=32, - do_stable_layer_norm=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.squeeze_factor = squeeze_factor - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.scope = scope - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length // self.squeeze_factor - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return SEWConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - 
num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - squeeze_factor=self.squeeze_factor, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout=self.hidden_dropout, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = SEWModel(config=config) - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.output_seq_length, self.hidden_size), - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - model = SEWModel(config=config) - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model( - input_values, attention_mask=attention_mask - ).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue( - np.allclose(output.asnumpy(), batch_output.asnumpy(), atol=1e-3) - ) - - def check_ctc_loss(self, config, input_values, *args): - model = SEWForCTC(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths( - mindspore.tensor(input_lengths) - ) - labels = ids_tensor( - (input_values.shape[0], int((min(max_length_labels) - 1).asnumpy())), - model.config.vocab_size, - ) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model( - input_values, attention_mask=attention_mask, labels=labels - ).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model( - input_values, attention_mask=attention_mask, labels=labels - ).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = SEWForCTC(config=config) - model.set_train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths( - mindspore.tensor(input_lengths) - ) - labels = ids_tensor( - (input_values.shape[0], int((max(max_length_labels) - 2).asnumpy())), - model.config.vocab_size, - ) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than 
logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = SEWForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model( - input_values, attention_mask=attention_mask, labels=labels - ).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = SEWForSequenceClassification(config=config) - model.set_train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = SEWForCTC(config) - model.set_train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths( - mindspore.tensor(input_lengths) - ) - labels = ids_tensor( - (input_values.shape[0], int((max(max_length_labels) - 2).asnumpy())), - model.config.vocab_size + 100, - ) - - with pytest.raises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class SEWModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (SEWForCTC, SEWModel, SEWForSequenceClassification) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": SEWForSequenceClassification, - "automatic-speech-recognition": SEWForCTC, - "feature-extraction": SEWModel, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = SEWModelTester(self) - self.config_tester = ConfigTester(self, config_class=SEWConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_ctc_train(self): - 
config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Hubert has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # SEW cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # SEW has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_get_set_embeddings(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip("No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - @unittest.skip("mindnlp does not support `_attn_implementation` config.") - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.parametrizations.weight", - "conv.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 - <= ((param.data.mean() * 1e9).round() / 1e9).item() - <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip('CPU has precision problem') - def test_batching_equivalence(self): - pass - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if ( - hasattr(module, "masked_spec_embed") - and module.masked_spec_embed is not None - ): - module.masked_spec_embed.data.fill_(3) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = SEWModel.from_pretrained("asapp/sew-tiny-100k") - self.assertIsNotNone(model) - - -@require_mindspore -class SEWUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = 
_compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length - ) - mask = mindspore.Tensor(mask) - - self.assertListEqual( - mask.sum(axis=-1).tolist(), - [mask_prob * sequence_length for _ in range(batch_size)], - ) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length - ) - mask = mindspore.Tensor(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - -@require_mindspore -@require_soundfile -@slow -class SEWModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation" - ) - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_inference_pretrained_batched(self): - model = SEWModel.from_pretrained("asapp/sew-tiny-100k") - processor = Wav2Vec2FeatureExtractor.from_pretrained("asapp/sew-tiny-100k") - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - outputs = model(input_values).last_hidden_state - - # expected outputs taken from the original SEW implementation - expected_outputs_first = mindspore.tensor( - [ - [ - [0.1509, 0.5372, 0.3061, -0.1694], - [-0.1700, 0.5764, 0.2753, -0.1299], - [0.1281, 0.7949, 0.2342, -0.1624], - [-0.1627, 0.6710, 0.2215, -0.1317], - ], - [ - [0.0408, 1.4355, 0.8605, -0.0968], - [0.0393, 1.2368, 0.6826, 0.0364], - [-0.1269, 1.9215, 1.1677, -0.1297], - [-0.1654, 1.6524, 0.6877, -0.0196], - ], - ], - ) - expected_outputs_last = mindspore.tensor( - [ - [ - [1.3379, -0.1450, -0.1500, -0.0515], - [0.8364, -0.1680, -0.1248, -0.0689], - [1.2791, -0.1507, -0.1523, -0.0564], - [0.8208, -0.1690, -0.1199, -0.0751], - ], - [ - [0.6959, -0.0861, -0.1235, -0.0861], - [0.4700, -0.1686, -0.1141, -0.1199], - [1.0776, -0.1137, -0.0124, -0.0472], - [0.5774, -0.1675, -0.0376, -0.0823], - ], - ], - ) - expected_output_sum = 62146.7422 - - self.assertTrue( - np.allclose( - outputs[:, :4, :4].asnumpy(), - expected_outputs_first.asnumpy(), - atol=5e-2, - ) - ) - self.assertTrue( - np.allclose( - outputs[:, -4:, -4:].asnumpy(), - expected_outputs_last.asnumpy(), - atol=5e-2, - ) - ) - self.assertTrue(abs(outputs.sum() - expected_output_sum) < 5) - - def test_inference_ctc_batched(self): - model = SEWForCTC.from_pretrained("asapp/sew-tiny-100k-ft-ls100h") - processor = Wav2Vec2Processor.from_pretrained( - "asapp/sew-tiny-100k-ft-ls100h", do_lower_case=True - ) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "swet covered brian's body trickling into the tightloine closs hat was the only garment he wore", - ] - self.assertListEqual(predicted_trans, 
EXPECTED_TRANSCRIPTIONS) diff --git a/tests/transformers/models/sew_d/__init__.py b/tests/transformers/models/sew_d/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/sew_d/test_modeling_sew_d.py b/tests/transformers/models/sew_d/test_modeling_sew_d.py deleted file mode 100644 index cad019989..000000000 --- a/tests/transformers/models/sew_d/test_modeling_sew_d.py +++ /dev/null @@ -1,546 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Hubert model.""" - -import math -import unittest - -import pytest - -from mindnlp.transformers import SEWDConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_soundfile, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - SEWDForCTC, - SEWDForSequenceClassification, - SEWDModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - from mindnlp.transformers.models.hubert.modeling_hubert import _compute_mask_indices - - -class SEWDModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=32, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(64, 32, 32), - conv_stride=(5, 2, 1), - conv_kernel=(10, 3, 1), - conv_bias=False, - num_conv_pos_embeddings=31, - num_conv_pos_embedding_groups=2, - squeeze_factor=2, - max_position_embeddings=512, - position_buckets=256, - share_att_key=True, - relative_attention=True, - position_biased_input=False, - pos_att_type=("p2c", "c2p"), - norm_rel_ebd="layer_norm", - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - vocab_size=32, - do_stable_layer_norm=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.squeeze_factor = squeeze_factor - self.max_position_embeddings = max_position_embeddings - self.position_buckets = position_buckets - self.share_att_key = share_att_key - self.relative_attention = 
relative_attention - self.position_biased_input = position_biased_input - self.pos_att_type = pos_att_type - self.norm_rel_ebd = norm_rel_ebd - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout = hidden_dropout - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.scope = scope - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length // self.squeeze_factor - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return SEWDConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - squeeze_factor=self.squeeze_factor, - max_position_embeddings=self.max_position_embeddings, - position_buckets=self.position_buckets, - share_att_key=self.share_att_key, - relative_attention=self.relative_attention, - position_biased_input=self.position_biased_input, - pos_att_type=self.pos_att_type, - norm_rel_ebd=self.norm_rel_ebd, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout=self.hidden_dropout, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = SEWDModel(config=config) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = SEWDModel(config=config) - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(ops.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, 
*args): - model = SEWDForCTC(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = SEWDForCTC(config=config) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_loss(self, config, input_values, *args): - model = SEWDForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = SEWDForSequenceClassification(config=config) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def 
check_labels_out_of_vocab(self, config, input_values, *args): - model = SEWDForCTC(config) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) - - with pytest.raises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class SEWDModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SEWDForCTC, SEWDModel, SEWDForSequenceClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "audio-classification": SEWDForSequenceClassification, - "automatic-speech-recognition": SEWDForCTC, - "feature-extraction": SEWDModel, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = SEWDModelTester(self) - self.config_tester = ConfigTester(self, config_class=SEWDConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Model has no inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Model has input_values instead of input_ids") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Model has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Model has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.parametrizations.weight", - "conv.weight", - "masked_spec_embed", - "quantizer.weight_proj.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if 
hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_checkpoints(self): - pass - - @unittest.skip(reason="No support for low_cpu_mem_usage=True.") - def test_save_load_low_cpu_mem_usage_no_safetensors(self): - pass - - @slow - def test_model_from_pretrained(self): - model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k") - self.assertIsNotNone(model) - - -@require_mindspore -class SEWDUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - -@require_mindspore -@require_soundfile -@slow -class SEWDModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_inference_pretrained_batched(self): - model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k") - processor = Wav2Vec2FeatureExtractor.from_pretrained("asapp/sew-d-tiny-100k") - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - with no_grad(): - outputs = model(input_values).last_hidden_state - - # expected outputs taken from the original SEW-D implementation - expected_outputs_first = mindspore.tensor( - [ - [ - [-0.1619, 0.6995, 0.4062, -0.1014], - [-0.1364, 0.5960, 0.0952, -0.0873], - [-0.1572, 0.5718, 0.4228, -0.0864], - [-0.1325, 0.6823, 0.1387, -0.0871], - ], - [ - [-0.1296, 0.4008, 0.4952, -0.1450], - [-0.1152, 0.3693, 0.3037, -0.1290], - [-0.1194, 0.6074, 0.3531, -0.1466], - [-0.1113, 0.3135, 0.2224, -0.1338], - ], - ], - ) - expected_outputs_last = mindspore.tensor( - [ - [ - [-0.1577, 0.5108, 0.8553, 0.2550], - [-0.1530, 0.3580, 0.6143, 0.2672], - [-0.1535, 0.4954, 0.8503, 0.1387], - [-0.1572, 0.3363, 0.6217, 0.1490], - ], - [ - [-0.1338, 0.5459, 0.9607, -0.1133], - 
[-0.1502, 0.3738, 0.7313, -0.0986], - [-0.0953, 0.4708, 1.0821, -0.0944], - [-0.1474, 0.3598, 0.7248, -0.0748], - ], - ], - ) - expected_output_sum = 54201.0469 - - self.assertTrue(ops.allclose(outputs[:, :4, :4], expected_outputs_first, atol=1e-3)) - self.assertTrue(ops.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=1e-3)) - self.assertTrue(abs(outputs.sum() - expected_output_sum) < 1) - - def test_inference_ctc_batched(self): - model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h") - processor = Wav2Vec2Processor.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - with no_grad(): - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "swet covered breon's body trickling into the titlowing closs that was the only garmened he war", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) \ No newline at end of file diff --git a/tests/transformers/models/speech_encoder_decoder/__init__.py b/tests/transformers/models/speech_encoder_decoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/transformers/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py deleted file mode 100644 index f38136447..000000000 --- a/tests/transformers/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ /dev/null @@ -1,789 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import tempfile -import unittest - -# from transformers import is_torch_available -from mindnlp.utils.testing_utils import slow - -from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from ..bert.test_modeling_bert import BertModelTester -from ..wav2vec2.test_modeling_wav2vec2 import Wav2Vec2ModelTester - - - -import numpy as np -import mindspore -import mindspore.common.dtype as mstype -from mindspore import Tensor - -from mindnlp.core import ops -from mindnlp.transformers import ( - BertLMHeadModel, - SpeechEncoderDecoderConfig, - SpeechEncoderDecoderModel, - Wav2Vec2Model, -) -from mindnlp.transformers.modeling_outputs import BaseModelOutput -from mindnlp.transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextEncoder -from mindnlp.transformers.models.speech_to_text.configuration_speech_to_text import Speech2TextConfig - -def prepare_speech_to_text_inputs_dict( - config, - input_features, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_features.ne(0) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - # "input_ids": input_features, - "input_features": input_features, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - -class Speech2TextModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - num_conv_layers=2, - conv_kernel_sizes=(5, 5), - conv_channels=32, - input_feat_per_channel=24, - input_channels=1, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=20, - max_target_positions=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.num_conv_layers = num_conv_layers - self.conv_kernel_sizes = conv_kernel_sizes - self.conv_channels = conv_channels - self.input_feat_per_channel = input_feat_per_channel - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.max_target_positions = max_target_positions - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - def 
prepare_config_and_inputs(self): - input_features = floats_tensor( - [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size - ) - attention_mask = ops.ones([self.batch_size, self.seq_length], dtype=mstype.int64) - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2) - - config = self.get_config() - inputs_dict = prepare_speech_to_text_inputs_dict( - config, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - ) - return config, inputs_dict - - def get_config(self): - return Speech2TextConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - num_conv_layers=self.num_conv_layers, - conv_kernel_sizes=self.conv_kernel_sizes, - conv_channels=self.conv_channels, - input_feat_per_channel=self.input_feat_per_channel, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - max_target_positions=self.max_target_positions, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_subsampled_output_lengths(self, input_lengths): - """ - Computes the output length of the convolutional layers - """ - - for i in range(self.num_conv_layers): - input_lengths = (input_lengths - 1) // 2 + 1 - - return input_lengths - - def create_and_check_model_forward(self, config, inputs_dict): - model = Speech2TextModel(config=config).set_train(False) - - input_features = inputs_dict["input_features"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - - # first forward pass - last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state - - self.parent.assertTrue(last_hidden_state.shape, (13, 7, 16)) - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = Speech2TextModel(config=config).get_decoder().set_train(False) - input_ids = inputs_dict["decoder_input_ids"] - attention_mask = inputs_dict["decoder_attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], axis=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice 
= output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(Tensor(np.allclose(output_from_past_slice.as_numpy(), output_from_no_past_slice.as_numpy(), atol=1e-2))) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = Speech2TextModel(config=config).set_train(False) - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = Speech2TextEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder( - inputs_dict["input_features"], attention_mask=inputs_dict["attention_mask"] - )[0] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = Speech2TextDecoder.from_pretrained(tmpdirname) - - encoder_attention_mask = encoder._get_feature_vector_attention_mask( - encoder_last_hidden_state.shape[1], inputs_dict["attention_mask"] - ) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=encoder_attention_mask, - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - -class EncoderDecoderMixin: - def get_encoder_decoder_model(self, config, decoder_config): - pass - - def prepare_config_and_inputs(self): - pass - - def get_pretrained_model_and_inputs(self): - pass - - def check_encoder_decoder_model_from_pretrained_configs( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - input_values=None, - input_features=None, - **kwargs, - ): - encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) - self.assertTrue(encoder_decoder_config.decoder.is_decoder) - - enc_dec_model = SpeechEncoderDecoderModel(encoder_decoder_config) - enc_dec_model.set_train(False) - - self.assertTrue(enc_dec_model.config.is_encoder_decoder) - self.assertFalse(enc_dec_model.config.tie_word_embeddings) - - outputs_encoder_decoder = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_encoder_decoder_model( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - input_values=None, - input_features=None, - **kwargs, - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - self.assertTrue(enc_dec_model.config.decoder.is_decoder) - self.assertTrue(enc_dec_model.config.decoder.add_cross_attention) - self.assertTrue(enc_dec_model.config.is_encoder_decoder) - enc_dec_model - outputs_encoder_decoder = enc_dec_model( - input_values=input_values, - input_features=input_features, - 
decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_hidden_states=True, - ) - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - encoder_outputs = BaseModelOutput(last_hidden_state=outputs_encoder_decoder.encoder_hidden_states[-1]) - outputs_encoder_decoder = enc_dec_model( - encoder_outputs=encoder_outputs, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_encoder_decoder_model_with_inputs( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - input_values=None, - input_features=None, - **kwargs, - ): - inputs = input_values if input_features is None else input_features - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - enc_dec_model - - outputs_encoder_decoder = enc_dec_model( - inputs, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_hidden_states=True, - ) - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - outputs_encoder_decoder_kwarg = enc_dec_model( - inputs=inputs, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_hidden_states=True, - ) - self.assertEqual( - outputs_encoder_decoder_kwarg["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_encoder_decoder_model_from_pretrained( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - return_dict, - input_values=None, - input_features=None, - **kwargs, - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} - enc_dec_model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) - enc_dec_model - outputs_encoder_decoder = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_hidden_states=True, - return_dict=True, - ) - - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_save_and_load( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - input_values=None, - input_features=None, - **kwargs, - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - enc_dec_model.set_train(False) - with mindspore._no_grad(): - outputs = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - out_2 = outputs[0].numpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: 
- enc_dec_model.save_pretrained(tmpdirname) - enc_dec_model = SpeechEncoderDecoderModel.from_pretrained(tmpdirname) - - after_outputs = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - out_1 = after_outputs[0].numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def check_save_and_load_encoder_decoder_model( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - input_values=None, - input_features=None, - **kwargs, - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - enc_dec_model.set_train(False) - with mindspore._no_grad(): - outputs = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - out_2 = outputs[0].numpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: - enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) - enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) - SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( - encoder_pretrained_model_name_or_path=encoder_tmp_dirname, - decoder_pretrained_model_name_or_path=decoder_tmp_dirname, - ) - - after_outputs = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - out_1 = after_outputs[0].numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def check_encoder_decoder_model_output_attentions( - self, - config, - attention_mask, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - labels=None, - input_values=None, - input_features=None, - **kwargs, - ): - # make the decoder inputs a different shape from the encoder inputs to harden the test - decoder_input_ids = decoder_input_ids[:, :-1] - decoder_attention_mask = decoder_attention_mask[:, :-1] - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - outputs_encoder_decoder = enc_dec_model( - input_values=input_values, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - ) - - inputs = input_values if input_features is None else input_features - - encoder_attentions = outputs_encoder_decoder["encoder_attentions"] - self.assertEqual(len(encoder_attentions), config.num_hidden_layers) - - seq_len = enc_dec_model.encoder._get_feat_extract_output_lengths(inputs.shape[1]) - self.assertEqual(encoder_attentions[0].shape[-3:], (config.num_attention_heads, seq_len, seq_len)) - - decoder_attentions = outputs_encoder_decoder["decoder_attentions"] - num_decoder_layers = ( - decoder_config.num_decoder_layers - if hasattr(decoder_config, "num_decoder_layers") - else decoder_config.num_hidden_layers - ) - self.assertEqual(len(decoder_attentions), 
num_decoder_layers) - - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), - ) - - cross_attentions = outputs_encoder_decoder["cross_attentions"] - self.assertEqual(len(cross_attentions), num_decoder_layers) - - cross_attention_input_seq_len = decoder_input_ids.shape[-1] - self.assertEqual( - cross_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), - ) - - def check_encoder_decoder_model_generate( - self, config, decoder_config, input_values=None, input_features=None, **kwargs - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - - # make sure EOS token is set to None to prevent early stopping of generation - if hasattr(enc_dec_model.config, "eos_token_id"): - enc_dec_model.config.eos_token_id = None - if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): - enc_dec_model.config.decoder.eos_token_id = None - if hasattr(enc_dec_model.generation_config, "eos_token_id"): - enc_dec_model.generation_config.eos_token_id = None - - inputs = input_values if input_features is None else input_features - - # Bert does not have a bos token id, so use pad_token_id instead - generated_output = enc_dec_model.generate( - inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id - ) - self.assertEqual(generated_output.shape, (inputs.shape[0],) + (decoder_config.max_length,)) - - def test_encoder_decoder_model(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model(**input_ids_dict) - - def test_encoder_decoder_model_with_inputs(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_with_inputs(**input_ids_dict) - - def test_encoder_decoder_model_from_pretrained_configs(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict) - - def test_encoder_decoder_model_from_pretrained(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False) - - def test_encoder_decoder_model_from_pretrained_return_dict(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True) - - def test_save_and_load_from_pretrained(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_save_and_load(**input_ids_dict) - - def test_save_and_load_from_encoder_decoder_pretrained(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_save_and_load_encoder_decoder_model(**input_ids_dict) - - def test_encoder_decoder_model_output_attentions(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_output_attentions(**input_ids_dict) - - def test_encoder_decoder_model_generate(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_generate(**input_ids_dict) - - def test_training_gradient_checkpointing(self): - inputs_dict = self.prepare_config_and_inputs() - encoder_model, decoder_model = self.get_encoder_decoder_model( - inputs_dict["config"], inputs_dict["decoder_config"] - ) - - model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - 
model.set_train() - # model.gradient_checkpointing_enable() - model.config.decoder_start_token_id = 0 - model.config.pad_token_id = 0 - - model_inputs = { - "attention_mask": inputs_dict["attention_mask"], - "labels": inputs_dict["labels"], - "decoder_input_ids": inputs_dict["decoder_input_ids"], - } - inputs = inputs_dict["input_features"] if "input_features" in inputs_dict else inputs_dict["input_values"] - - loss = model(inputs, **model_inputs).loss - - @slow - def test_real_model_save_load_from_pretrained(self): - model_2, inputs = self.get_pretrained_model_and_inputs() - - with mindspore._no_grad(): - outputs = model_2(**inputs) - out_2 = outputs[0].numpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmp_dirname: - model_2.save_pretrained(tmp_dirname) - model_1 = SpeechEncoderDecoderModel.from_pretrained(tmp_dirname) - - after_outputs = model_1(**inputs) - out_1 = after_outputs[0].numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - -class Wav2Vec2BertModelTest(EncoderDecoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( - encoder_pretrained_model_name_or_path="facebook/wav2vec2-base-960h", decoder_pretrained_model_name_or_path="google-bert/bert-base-cased" - ) - batch_size = 13 - input_values = floats_tensor([batch_size, 512], scale=1.0) - attention_mask = random_attention_mask([batch_size, 512]) - decoder_input_ids = ids_tensor([batch_size, 4], model.decoder.config.vocab_size) - decoder_attention_mask = random_attention_mask([batch_size, 4]) - inputs = { - "input_values": input_values, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - } - - return model, inputs - - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = Wav2Vec2Model(config).set_train(False) - decoder_model = BertLMHeadModel(decoder_config).set_train(False) - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - bert_model_tester = BertModelTester(self) - wav2vec2_model_tester = Wav2Vec2ModelTester(self) - encoder_config_and_inputs = wav2vec2_model_tester.prepare_config_and_inputs() - decoder_config_and_inputs = bert_model_tester.prepare_config_and_inputs_for_decoder() - ( - config, - input_values, - input_mask, - ) = encoder_config_and_inputs - ( - decoder_config, - decoder_input_ids, - decoder_token_type_ids, - decoder_input_mask, - decoder_sequence_labels, - decoder_token_labels, - decoder_choice_labels, - encoder_attention_mask, - _, - ) = decoder_config_and_inputs - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - return { - "config": config, - "input_values": input_values, - "attention_mask": input_mask, - "decoder_config": decoder_config, - "decoder_input_ids": decoder_input_ids, - "decoder_token_type_ids": decoder_token_type_ids, - "decoder_attention_mask": decoder_input_mask, - "decoder_sequence_labels": decoder_sequence_labels, - "decoder_token_labels": decoder_token_labels, - "decoder_choice_labels": decoder_choice_labels, - "labels": decoder_token_labels, - } - - -class Speech2TextBertModelTest(EncoderDecoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( - "facebook/s2t-small-librispeech-asr", "google-bert/bert-base-cased" - ) - batch_size = 13 - 
input_features = floats_tensor([batch_size, 7, 80], scale=1.0) - attention_mask = random_attention_mask([batch_size, 7]) - decoder_input_ids = ids_tensor([batch_size, 4], model.decoder.config.vocab_size) - decoder_attention_mask = random_attention_mask([batch_size, 4]) - inputs = { - "input_features": input_features, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - } - - return model, inputs - - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = Speech2TextEncoder(config).set_train(False) - decoder_model = BertLMHeadModel(decoder_config).set_train(False) - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - bert_model_tester = BertModelTester(self) - speech2text_model_tester = Speech2TextModelTester(self) - encoder_config_and_inputs = speech2text_model_tester.prepare_config_and_inputs() - decoder_config_and_inputs = bert_model_tester.prepare_config_and_inputs_for_decoder() - - config, inputs = encoder_config_and_inputs - input_features = inputs["input_features"] - input_mask = inputs["attention_mask"] - - ( - decoder_config, - decoder_input_ids, - decoder_token_type_ids, - decoder_input_mask, - decoder_sequence_labels, - decoder_token_labels, - decoder_choice_labels, - encoder_attention_mask, - _, - ) = decoder_config_and_inputs - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - return { - "config": config, - "input_features": input_features, - "attention_mask": input_mask, - "decoder_config": decoder_config, - "decoder_input_ids": decoder_input_ids, - "decoder_token_type_ids": decoder_token_type_ids, - "decoder_attention_mask": decoder_input_mask, - "decoder_sequence_labels": decoder_sequence_labels, - "decoder_token_labels": decoder_token_labels, - "decoder_choice_labels": decoder_choice_labels, - "labels": decoder_token_labels, - } - - # can't save full model for now because Speech2TextModel != Speech2TextEncoder - def test_encoder_decoder_model_from_pretrained_configs(self): - pass - - # can't save full model for now because Speech2TextModel != Speech2TextEncoder - def test_save_and_load_from_pretrained(self): - pass - - # all published pretrained models are Speech2TextModel != Speech2TextEncoder - def test_real_model_save_load_from_pretrained(self): - pass diff --git a/tests/transformers/models/speech_to_text/__init__.py b/tests/transformers/models/speech_to_text/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/speech_to_text/test_modeling_speech_to_text.py b/tests/transformers/models/speech_to_text/test_modeling_speech_to_text.py deleted file mode 100644 index 8ba2ac4c5..000000000 --- a/tests/transformers/models/speech_to_text/test_modeling_speech_to_text.py +++ /dev/null @@ -1,763 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch Speech2Text model.""" - -import copy -import inspect -import os -import tempfile -import unittest -import numpy as np - -import pytest - -from mindnlp.transformers import Speech2TextConfig -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_sentencepiece, - require_tokenizers, - slow, - -) -from mindnlp.utils import cached_property - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - - from mindnlp.core import nn, ops - - from mindnlp.transformers import Speech2TextForConditionalGeneration, Speech2TextModel, Speech2TextProcessor - from mindnlp.transformers.models.speech_to_text.modeling_speech_to_text import Speech2TextDecoder, \ - Speech2TextEncoder - - -def prepare_speech_to_text_inputs_dict( - config, - input_features, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if attention_mask is None: - attention_mask = input_features.ne(0) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - # "input_ids": input_features, - "input_features": input_features, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -class Speech2TextModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - num_conv_layers=2, - conv_kernel_sizes=(5, 5), - conv_channels=32, - input_feat_per_channel=24, - input_channels=1, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=20, - max_target_positions=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.num_conv_layers = num_conv_layers - self.conv_kernel_sizes = conv_kernel_sizes - self.conv_channels = conv_channels - self.input_feat_per_channel = input_feat_per_channel - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.max_target_positions = max_target_positions - self.eos_token_id = 
eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_features = floats_tensor( - [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size - ) - attention_mask = ops.ones([self.batch_size, self.seq_length], dtype=mindspore.int64) - decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2) - - config = self.get_config() - inputs_dict = prepare_speech_to_text_inputs_dict( - config, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - ) - return config, inputs_dict - - def get_config(self): - return Speech2TextConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - num_conv_layers=self.num_conv_layers, - conv_kernel_sizes=self.conv_kernel_sizes, - conv_channels=self.conv_channels, - input_feat_per_channel=self.input_feat_per_channel, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - max_target_positions=self.max_target_positions, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_subsampled_output_lengths(self, input_lengths): - """ - Computes the output length of the convolutional layers - """ - - for i in range(self.num_conv_layers): - input_lengths = (input_lengths - 1) // 2 + 1 - - return input_lengths - - def create_and_check_model_forward(self, config, inputs_dict): - model = Speech2TextModel(config=config).eval() - - input_features = inputs_dict["input_features"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - - # first forward pass - last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state - - self.parent.assertTrue(last_hidden_state.shape, (13, 7, 16)) - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = Speech2TextModel(config=config).get_decoder().eval() - input_ids = inputs_dict["decoder_input_ids"] - attention_mask = inputs_dict["decoder_attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = 
output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = Speech2TextModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = Speech2TextEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder( - inputs_dict["input_features"], attention_mask=inputs_dict["attention_mask"] - )[0] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = Speech2TextDecoder.from_pretrained(tmpdirname) - - encoder_attention_mask = encoder._get_feature_vector_attention_mask( - encoder_last_hidden_state.shape[1], inputs_dict["attention_mask"] - ) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - encoder_attention_mask=encoder_attention_mask, - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (Speech2TextModel, Speech2TextForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (Speech2TextForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"automatic-speech-recognition": Speech2TextForConditionalGeneration, "feature-extraction": Speech2TextModel} - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = True - test_pruning = False - test_missing_keys = False - - input_name = "input_features" - - def _get_input_ids_and_config(self, batch_size=2): - config, input_ids, attention_mask, inputs_dict = GenerationTesterMixin._get_input_ids_and_config(self) - - # `input_ids` is actually `input_features` which is a 3D tensor. - # We must overwrite the mask to make it 2D since the original `_get_input_ids_and_config` creates an - # attention mask of the same shape as `input_ids`. 
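# (Illustrative sketch, assuming the tester defaults defined above; not taken verbatim from this file.)
# `input_features` has shape (batch, time, feature), so a valid attention mask is 2-D over the time
# axis only, and the encoder's stride-2 convolutions then subsample that axis exactly as
# Speech2TextModelTester.get_subsampled_output_lengths does: L -> (L - 1) // 2 + 1 per conv layer.
def _subsampled_length(length, num_conv_layers=2):
    # mirrors the per-layer length formula used by the tester and the model's
    # _get_feat_extract_output_lengths helper referenced in the checks below
    for _ in range(num_conv_layers):
        length = (length - 1) // 2 + 1
    return length

# with the default seq_length of 7 and two conv layers: 7 -> 4 -> 2 encoder frames,
# which is the subsampled length the attention/hidden-state shape checks below rely on
assert _subsampled_length(7) == 2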
- if len(attention_mask.shape) > 2: - sequence_length = input_ids.shape[1] - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - - return config, input_ids, attention_mask, inputs_dict - - def setUp(self): - self.model_tester = Speech2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Speech2TextConfig) - self.maxDiff = 3000 - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_model_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - @unittest.skip(reason="Not implemented currently") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Training is not supported yet") - def test_training(self): - pass - - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_features = input_dict["input_features"] - attention_mask = input_dict["attention_mask"] - model = Speech2TextForConditionalGeneration(config).eval() - input_features = input_features.half() - model.half() - model.generate(input_features, attention_mask=attention_mask) - model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_features", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, 
config, model_class): - model = model_class(config) - - model.eval() - - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - else: - seq_length = self.model_tester.seq_length - - subsampled_seq_length = model._get_feat_extract_output_lengths(seq_length) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [subsampled_seq_length, self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - - model.eval() - - subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length) - subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length) - - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, 
subsampled_encoder_key_length], - ) - out_len = len(outputs) - - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - subsampled_encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - - def test_resize_tokens_embeddings(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to False") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # make sure that decoder_input_ids are resized - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size 
- 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_resize_embeddings_untied(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to False") - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - self.skipTest(reason="Model cannot untie embeddings") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - print(output_embeds) - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - if "decoder_input_ids" in inputs_dict: - # print("clamp before:", inputs_dict["decoder_input_ids"]) - # print(model_vocab_size - 15 - 1) - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - # print("clamp after:", inputs_dict["decoder_input_ids"]) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - @unittest.skip - def test_generate_without_input_ids(self): - pass - - @staticmethod - def _get_encoder_outputs( - model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 - ): - encoder = model.get_encoder() - encoder_outputs = encoder( - input_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( - num_interleave, dim=0 - ) - input_ids = input_ids[:, :, 0] - generation_config = copy.deepcopy(model.generation_config) - model._prepare_special_tokens(generation_config) - input_ids = ops.zeros_like(input_ids[:, 
:1]) + generation_config.decoder_start_token_id - attention_mask = None - return encoder_outputs, input_ids, attention_mask - - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, seq_length = input_ids.shape[:2] - subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) - num_sequences_in_output = batch_size * num_return_sequences - gen_len = ( - output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length - ) - - # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) - - # Attentions - # encoder - self._check_encoder_attention_for_generate( - output.encoder_attentions, batch_size, config, subsampled_seq_length - ) - # decoder - self._check_attentions_for_generate( - num_sequences_in_output, - output.decoder_attentions, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Hidden States - # encoder - self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, subsampled_seq_length - ) - - # decoder - self._check_hidden_states_for_generate( - num_sequences_in_output, - output.decoder_hidden_states, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - @unittest.skip(reason="Test failing, @RocketNight is looking into it") - def test_tf_from_pt_safetensors(self): - pass - - -@require_sentencepiece -@require_tokenizers -@slow -class Speech2TextModelIntegrationTests(unittest.TestCase): - @cached_property - def default_processor(self): - return Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") - - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_generation_librispeech(self): - model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr",from_pt=True) - - processor = self.default_processor - - input_speech = self._load_datasamples(1) - - input_features = processor(input_speech, return_tensors="ms").input_features - - generated_ids = model.generate(input_features) - - generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True) - - EXPECTED_TRANSCRIPTIONS = [ - "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel" - ] - self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS) - - def test_generation_librispeech_batched(self): - model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr",from_pt=True) - processor = self.default_processor - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_features = inputs.input_features - attention_mask = inputs.attention_mask - - generated_ids = model.generate(input_features, attention_mask=attention_mask) - generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True) - - EXPECTED_TRANSCRIPTIONS = [ - "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel", - "nor is mister cultar's manner less interesting than his matter", - "he tells us that 
at this festive season of the year with christmas and roast beef looming before us" - " similes drawn from eating and its results occur most readily to the mind", - "he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it" - " but little of rocky ithaca", - ] - - self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS) \ No newline at end of file diff --git a/tests/transformers/models/speech_to_text/test_tokenization_speech_to_text.py b/tests/transformers/models/speech_to_text/test_tokenization_speech_to_text.py deleted file mode 100644 index 0462adf1b..000000000 --- a/tests/transformers/models/speech_to_text/test_tokenization_speech_to_text.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from pathlib import Path -from shutil import copyfile - -from mindnlp.transformers import SPIECE_UNDERLINE -from mindnlp.transformers.models.speech_to_text.tokenization_speech_to_text import Speech2TextTokenizer -from mindnlp.transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json -from mindnlp.utils.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow - -from ...test_tokenization_common import TokenizerTesterMixin - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - -import sentencepiece as sp - -FR_CODE = 5 -ES_CODE = 10 - - -@require_sentencepiece -@require_tokenizers -class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "facebook/s2t-small-librispeech-asr" - tokenizer_class = Speech2TextTokenizer - test_rust_tokenizer = False - test_sentencepiece = True - - def setUp(self): - super().setUp() - - spm_model = sp.SentencePieceProcessor() - spm_model.Load(SAMPLE_VOCAB) - vocab = ["", "", "", ""] - - vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - save_dir = Path(self.tmpdirname) - save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) - if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): - copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"]) - - tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 1 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - vocab_keys = list(self.get_tokenizer().get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "j") - self.assertEqual(len(vocab_keys), 1_001) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 1_001) - - def 
test_full_tokenizer(self): - tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), - [289, 50, 14, 174, 386], - ) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, - [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", - SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", - SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", "al", "s", "é", "."]) # fmt: skip - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual(ids, [12, 25, 88, 59, 28, 23, 11, 4, 606, 351, 351, 351, 7, 16, 70, 50, 76, 84, 10, 4, 8]) - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, - [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", - SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "", "2", "0", "0", "0", ",", - SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", "al", "s", "", "."]) # fmt: skip - - @slow - def test_tokenizer_integration(self): - expected_encoding = {'input_ids': [ - [3791, 797, 31, 11, 64, 797, 31, 2429, 433, 12, 1176, 12, 20, 786, 915, 142, 2413, 240, 37, 3238, 797, 31, - 11, 35, 93, 915, 142, 2413, 240, 37, 5540, 567, 1276, 93, 37, 610, 40, 62, 455, 657, 1042, 123, 780, 177, - 37, 309, 241, 1298, 514, 20, 292, 2737, 114, 2469, 241, 85, 64, 302, 548, 528, 423, 4, 509, 406, 423, 37, - 601, 4, 777, 302, 548, 528, 423, 284, 4, 3388, 511, 459, 4, 3555, 40, 321, 302, 705, 4, 3388, 511, 583, - 326, 5, 5, 5, 62, 3310, 560, 177, 2680, 217, 1508, 32, 31, 853, 418, 64, 583, 511, 1605, 62, 35, 93, 560, - 177, 2680, 217, 1508, 1521, 64, 583, 511, 519, 62, 20, 1515, 764, 20, 149, 261, 5625, 7972, 20, 5540, 567, - 1276, 93, 3925, 1675, 11, 15, 802, 7972, 576, 217, 1508, 11, 35, 93, 1253, 2441, 15, 289, 652, 31, 416, - 321, 3842, 115, 40, 911, 8, 476, 619, 4, 380, 142, 423, 335, 240, 35, 93, 264, 8, 11, 335, 569, 420, 163, - 5, 2], - [260, 548, 528, 423, 20, 451, 20, 2681, 1153, 3434, 20, 5540, 37, 567, 126, 1253, 2441, 3376, 449, 210, 431, - 1563, 177, 767, 5540, 11, 1203, 472, 11, 2953, 685, 285, 364, 706, 1153, 20, 6799, 20, 2869, 20, 4464, 126, - 40, 2429, 20, 1040, 866, 2664, 418, 20, 318, 20, 1726, 186, 20, 265, 522, 35, 93, 2191, 4634, 20, 1040, 12, - 6799, 15, 228, 2356, 142, 31, 11, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [2575, 2666, 684, 1582, 1176, 12, 627, 149, 619, 20, 4902, 563, 11, 20, 149, 261, 3420, 2356, 174, 142, - 4714, 131, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': 
[ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0]]} # fmt: skip - - self.tokenizer_integration_test_util( - expected_encoding=expected_encoding, - model_name="facebook/s2t-small-mustc-en-de-st", - revision="a14f04cf0776c02f62a8cb800cf7909e15ea23ad", - ) - - -@require_sentencepiece -class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase): - checkpoint_name = "valhalla/s2t_mustc_multilinguial_medium" - - french_text = "C'est trop cool" - spanish_text = "Esto es genial" - - @classmethod - def setUpClass(cls): - cls.tokenizer: Speech2TextTokenizer = Speech2TextTokenizer.from_pretrained(cls.checkpoint_name) - return cls - - def check_language_codes(self): - self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4) - self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6) - self.assertEqual(self.tokenizer.lang_code_to_id["it"], 9) - self.assertEqual(self.tokenizer.lang_code_to_id["de"], 11) - - def test_vocab_size(self): - self.assertEqual(self.tokenizer.vocab_size, 10_000) - - def test_tokenizer_decode_ignores_language_codes(self): - self.assertIn(ES_CODE, self.tokenizer.all_special_ids) - generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2] - result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) - self.assertEqual(result, expected_spanish) - self.assertNotIn(self.tokenizer.eos_token, result) - - def test_tokenizer_adds_special_tokens(self): - self.tokenizer.tgt_lang = "fr" - encoded = self.tokenizer(self.french_text).input_ids - self.assertEqual(encoded[0], FR_CODE) - self.assertEqual(encoded[-1], self.tokenizer.eos_token_id) - - def test_tgt_lang_setter(self): - self.tokenizer.tgt_lang = "fr" - self.assertListEqual(self.tokenizer.prefix_tokens, [FR_CODE]) - - self.tokenizer.tgt_lang = "es" - self.assertListEqual(self.tokenizer.prefix_tokens, [ES_CODE]) diff --git a/tests/transformers/models/speecht5/__init__.py 
b/tests/transformers/models/speecht5/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/speecht5/test_feature_extraction_speecht5.py b/tests/transformers/models/speecht5/test_feature_extraction_speecht5.py deleted file mode 100644 index 015361c54..000000000 --- a/tests/transformers/models/speecht5/test_feature_extraction_speecht5.py +++ /dev/null @@ -1,422 +0,0 @@ -# coding=utf-8 -# Copyright 2021-2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for the SpeechT5 feature extractors.""" - -import itertools -import random -import unittest - -import numpy as np - -from mindnlp.transformers import SpeechT5FeatureExtractor -from mindnlp.transformers.feature_extraction_utils import BatchFeature -from mindnlp.utils.testing_utils import require_mindspore -from mindnlp.utils import is_mindspore_available - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -if is_mindspore_available(): - import mindspore - - -global_rng = random.Random() - - -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -# @require_mindspore -# class SpeechT5FeatureExtractionTester(unittest.TestCase): -# def __init__( -# self, -# parent, -# batch_size=7, -# min_seq_length=400, -# max_seq_length=2000, -# feature_size=1, -# padding_value=0.0, -# sampling_rate=16000, -# do_normalize=True, -# num_mel_bins=80, -# hop_length=16, -# win_length=64, -# win_function="hann_window", -# fmin=80, -# fmax=7600, -# mel_floor=1e-10, -# return_attention_mask=True, -# ): -# self.parent = parent -# self.batch_size = batch_size -# self.min_seq_length = min_seq_length -# self.max_seq_length = max_seq_length -# self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) -# self.feature_size = feature_size -# self.padding_value = padding_value -# self.sampling_rate = sampling_rate -# self.do_normalize = do_normalize -# self.num_mel_bins = num_mel_bins -# self.hop_length = hop_length -# self.win_length = win_length -# self.win_function = win_function -# self.fmin = fmin -# self.fmax = fmax -# self.mel_floor = mel_floor -# self.return_attention_mask = return_attention_mask - -# def prepare_feat_extract_dict(self): -# return { -# "feature_size": self.feature_size, -# "padding_value": self.padding_value, -# "sampling_rate": self.sampling_rate, -# "do_normalize": self.do_normalize, -# "num_mel_bins": self.num_mel_bins, -# "hop_length": self.hop_length, -# "win_length": self.win_length, -# "win_function": self.win_function, -# "fmin": self.fmin, -# "fmax": self.fmax, -# "mel_floor": self.mel_floor, -# "return_attention_mask": self.return_attention_mask, -# } - -# def 
prepare_inputs_for_common(self, equal_length=False, numpify=False): -# def _flatten(list_of_lists): -# return list(itertools.chain(*list_of_lists)) - -# if equal_length: -# speech_inputs = floats_list((self.batch_size, self.max_seq_length)) -# else: -# # make sure that inputs increase in size -# speech_inputs = [ -# _flatten(floats_list((x, self.feature_size))) -# for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) -# ] - -# if numpify: -# speech_inputs = [np.asarray(x) for x in speech_inputs] - -# return speech_inputs - -# def prepare_inputs_for_target(self, equal_length=False, numpify=False): -# if equal_length: -# speech_inputs = [floats_list((self.max_seq_length, self.num_mel_bins)) for _ in range(self.batch_size)] -# else: -# # make sure that inputs increase in size -# speech_inputs = [ -# floats_list((x, self.num_mel_bins)) -# for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) -# ] - -# if numpify: -# speech_inputs = [np.asarray(x) for x in speech_inputs] - -# return speech_inputs - - -# @require_mindspore -# class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): -# feature_extraction_class = SpeechT5FeatureExtractor - -# def setUp(self): -# self.feat_extract_tester = SpeechT5FeatureExtractionTester(self) - -# def _check_zero_mean_unit_variance(self, input_vector): -# self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3)) -# self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3)) - -# def test_call(self): -# # Tests that all call wrap to encode_plus and batch_encode_plus -# feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# # create three inputs of length 800, 1000, and 1200 -# speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] -# np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - -# # Test not batched input -# encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values -# encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values -# self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - -# # Test batched -# encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values -# encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values -# for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): -# self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - -# def test_zero_mean_unit_variance_normalization_np(self): -# feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - -# paddings = ["longest", "max_length", "do_not_pad"] -# max_lengths = [None, 1600, None] -# for max_length, padding in zip(max_lengths, paddings): -# processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np") -# input_values = processed.input_values - -# self._check_zero_mean_unit_variance(input_values[0][:800]) -# self.assertTrue(input_values[0][800:].sum() < 1e-6) -# self._check_zero_mean_unit_variance(input_values[1][:1000]) -# self.assertTrue(input_values[0][1000:].sum() < 1e-6) -# self._check_zero_mean_unit_variance(input_values[2][:1200]) - -# def test_zero_mean_unit_variance_normalization(self): -# feat_extract = 
self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# lengths = range(800, 1400, 200) -# speech_inputs = [floats_list((1, x))[0] for x in lengths] - -# paddings = ["longest", "max_length", "do_not_pad"] -# max_lengths = [None, 1600, None] - -# for max_length, padding in zip(max_lengths, paddings): -# processed = feat_extract(speech_inputs, max_length=max_length, padding=padding) -# input_values = processed.input_values - -# self._check_zero_mean_unit_variance(input_values[0][:800]) -# self._check_zero_mean_unit_variance(input_values[1][:1000]) -# self._check_zero_mean_unit_variance(input_values[2][:1200]) - -# def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self): -# feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] -# processed = feat_extract( -# speech_inputs, truncation=True, max_length=1000, padding="max_length", return_tensors="np" -# ) -# input_values = processed.input_values - -# self._check_zero_mean_unit_variance(input_values[0, :800]) -# self._check_zero_mean_unit_variance(input_values[1]) -# self._check_zero_mean_unit_variance(input_values[2]) - -# def test_zero_mean_unit_variance_normalization_trunc_np_longest(self): -# feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] -# processed = feat_extract( -# speech_inputs, truncation=True, max_length=1000, padding="longest", return_tensors="np" -# ) -# input_values = processed.input_values - -# self._check_zero_mean_unit_variance(input_values[0, :800]) -# self._check_zero_mean_unit_variance(input_values[1, :1000]) -# self._check_zero_mean_unit_variance(input_values[2]) - -# # make sure that if max_length < longest -> then pad to max_length -# self.assertTrue(input_values.shape == (3, 1000)) - -# speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] -# processed = feat_extract( -# speech_inputs, truncation=True, max_length=2000, padding="longest", return_tensors="np" -# ) -# input_values = processed.input_values - -# self._check_zero_mean_unit_variance(input_values[0, :800]) -# self._check_zero_mean_unit_variance(input_values[1, :1000]) -# self._check_zero_mean_unit_variance(input_values[2]) - -# # make sure that if max_length > longest -> then pad to longest -# self.assertTrue(input_values.shape == (3, 1200)) - -# def test_double_precision_pad(self): -# feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# np_speech_inputs = np.random.rand(100).astype(np.float64) -# py_speech_inputs = np_speech_inputs.tolist() - -# for inputs in [py_speech_inputs, np_speech_inputs]: -# np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") -# self.assertTrue(np_processed.input_values.dtype == np.float32) -# pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="ms") -# self.assertTrue(pt_processed.input_values.dtype == mindspore.float32) - -# def test_call_target(self): -# # Tests that all call wrap to encode_plus and batch_encode_plus -# feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) -# # create three inputs of length 800, 1000, and 1200 -# speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] -# np_speech_inputs = [np.asarray(speech_input) for 
speech_input in speech_inputs] - -# # Test feature size -# input_values = feature_extractor(audio_target=np_speech_inputs, padding=True, return_tensors="np").input_values -# self.assertTrue(input_values.ndim == 3) -# self.assertTrue(input_values.shape[-1] == feature_extractor.num_mel_bins) - -# # Test not batched input -# encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_values -# encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_values -# self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - -# # Test batched -# encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values -# encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values -# for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): -# self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - -# # Test 2-D numpy arrays are batched. -# speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] -# np_speech_inputs = np.asarray(speech_inputs) -# encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values -# encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values -# for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): -# self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - -# def test_batch_feature_target(self): -# speech_inputs = self.feat_extract_tester.prepare_inputs_for_target() -# feat_extract = self.feature_extraction_class(**self.feat_extract_dict) -# input_name = feat_extract.model_input_names[0] - -# processed_features = BatchFeature({input_name: speech_inputs}) - -# self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name]))) - -# speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True) -# processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np") - -# batch_features_input = processed_features[input_name] - -# if len(batch_features_input.shape) < 3: -# batch_features_input = batch_features_input[:, :, None] - -# self.assertTrue( -# batch_features_input.shape -# == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins) -# ) - -# @require_mindspore -# def test_batch_feature_target_pt(self): -# speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True) -# feat_extract = self.feature_extraction_class(**self.feat_extract_dict) -# input_name = feat_extract.model_input_names[0] - -# processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="ms") - -# batch_features_input = processed_features[input_name] - -# if len(batch_features_input.shape) < 3: -# batch_features_input = batch_features_input[:, :, None] - -# self.assertTrue( -# batch_features_input.shape -# == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins) -# ) - -# @require_mindspore -# def test_padding_accepts_tensors_target_pt(self): -# feat_extract = self.feature_extraction_class(**self.feat_extract_dict) -# speech_inputs = self.feat_extract_tester.prepare_inputs_for_target() -# input_name = feat_extract.model_input_names[0] - -# processed_features = BatchFeature({input_name: speech_inputs}) - -# feat_extract.feature_size = feat_extract.num_mel_bins # hack! 
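
[Editor's note] The padding behaviour these deleted tests exercise can be illustrated without the feature extractor itself. A minimal, numpy-only sketch (the helper name and the 80-mel-bin assumption are illustrative, not part of SpeechT5FeatureExtractor): variable-length mel-feature sequences are zero-padded to the longest length and an attention mask records the valid frames, which is why the padded region must sum to zero and the mask row sums must equal the original lengths.

import numpy as np

def pad_mel_features(features, num_mel_bins=80):
    """Pad a list of (length, num_mel_bins) arrays to the longest length."""
    max_len = max(f.shape[0] for f in features)
    batch = np.zeros((len(features), max_len, num_mel_bins), dtype=np.float32)
    mask = np.zeros((len(features), max_len), dtype=np.int64)
    for i, f in enumerate(features):
        batch[i, : f.shape[0]] = f   # copy valid frames
        mask[i, : f.shape[0]] = 1    # mark them in the attention mask
    return batch, mask

feats = [np.random.rand(n, 80).astype(np.float32) for n in (800, 1000, 1200)]
padded, attention_mask = pad_mel_features(feats)
assert padded.shape == (3, 1200, 80)
assert attention_mask.sum(-1).tolist() == [800, 1000, 1200]
assert padded[0, 800:].sum() == 0.0  # padded region stays zero
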
- -# input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] -# input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="ms")[input_name] - -# self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2) - -# def test_attention_mask_target(self): -# feat_dict = self.feat_extract_dict -# feat_dict["return_attention_mask"] = True -# feat_extract = self.feature_extraction_class(**feat_dict) -# speech_inputs = self.feat_extract_tester.prepare_inputs_for_target() -# input_lengths = [len(x) for x in speech_inputs] -# input_name = feat_extract.model_input_names[0] - -# processed = BatchFeature({input_name: speech_inputs}) - -# feat_extract.feature_size = feat_extract.num_mel_bins # hack! - -# processed = feat_extract.pad(processed, padding="longest", return_tensors="np") -# self.assertIn("attention_mask", processed) -# self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) -# self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths) - -# def test_attention_mask_with_truncation_target(self): -# feat_dict = self.feat_extract_dict -# feat_dict["return_attention_mask"] = True -# feat_extract = self.feature_extraction_class(**feat_dict) -# speech_inputs = self.feat_extract_tester.prepare_inputs_for_target() -# input_lengths = [len(x) for x in speech_inputs] -# input_name = feat_extract.model_input_names[0] - -# processed = BatchFeature({input_name: speech_inputs}) -# max_length = min(input_lengths) - -# feat_extract.feature_size = feat_extract.num_mel_bins # hack! - -# processed_pad = feat_extract.pad( -# processed, padding="max_length", max_length=max_length, truncation=True, return_tensors="np" -# ) -# self.assertIn("attention_mask", processed_pad) -# self.assertListEqual( -# list(processed_pad.attention_mask.shape), [processed_pad[input_name].shape[0], max_length] -# ) -# self.assertListEqual( -# processed_pad.attention_mask[:, :max_length].sum(-1).tolist(), [max_length for x in speech_inputs] -# ) - -# def _load_datasamples(self, num_samples): -# from datasets import load_dataset - -# ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") -# # automatic decoding with librispeech -# speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - -# return [x["array"] for x in speech_samples] - -# def test_integration(self): -# # fmt: off -# EXPECTED_INPUT_VALUES = mindspore.tensor( -# [2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03, -# 3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03, -# 2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04, -# 4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03, -# 7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04, -# 4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03] -# ) -# # fmt: on - -# input_speech = self._load_datasamples(1) -# feature_extractor = SpeechT5FeatureExtractor() -# input_values = feature_extractor(input_speech, return_tensors="ms").input_values -# self.assertEqual(input_values.shape, (1, 93680)) -# self.assertTrue(np.allclose(input_values[0, :30].asnumpy(), EXPECTED_INPUT_VALUES.asnumpy(), atol=1e-6)) - -# def test_integration_target(self): -# # fmt: off -# EXPECTED_INPUT_VALUES = mindspore.tensor( -# [-2.6870, -3.0104, -3.1356, -3.5352, -3.0044, -3.0353, -3.4719, -3.6777, -# -3.1520, -2.9435, -2.6553, -2.8795, -2.9944, -2.5921, -3.0279, -3.0386, -# -3.0864, 
-3.1291, -3.2353, -2.7444, -2.6831, -2.7287, -3.1761, -3.1571, -# -3.2726, -3.0582, -3.1007, -3.4533, -3.4695, -3.0998] -# ) -# # fmt: on - -# input_speech = self._load_datasamples(1) -# feature_extractor = SpeechT5FeatureExtractor() -# input_values = feature_extractor(audio_target=input_speech, return_tensors="ms").input_values -# self.assertEqual(input_values.shape, (1, 366, 80)) -# self.assertTrue(np.allclose(input_values[0, 0, :30].asnumpy(), EXPECTED_INPUT_VALUES.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/speecht5/test_modeling_speecht5.py b/tests/transformers/models/speecht5/test_modeling_speecht5.py deleted file mode 100644 index c0b75cfae..000000000 --- a/tests/transformers/models/speecht5/test_modeling_speecht5.py +++ /dev/null @@ -1,1871 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore SpeechT5 model.""" - -import copy -import inspect -import tempfile -import unittest -import numpy as np - -from mindnlp.transformers import SpeechT5Config, SpeechT5HifiGanConfig -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.engine import set_seed -from mindnlp.utils import cached_property - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - SpeechT5ForSpeechToSpeech, - SpeechT5ForSpeechToText, - SpeechT5ForTextToSpeech, - SpeechT5HifiGan, - SpeechT5Model, - SpeechT5Processor, - ) - -def prepare_inputs_dict( - config, - input_ids=None, - input_values=None, - decoder_input_ids=None, - decoder_input_values=None, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if input_ids is not None: - encoder_dict = {"input_ids": input_ids} - else: - encoder_dict = {"input_values": input_values} - - if decoder_input_ids is not None: - decoder_dict = {"decoder_input_ids": decoder_input_ids} - else: - decoder_dict = {"decoder_input_values": decoder_input_values} - - if head_mask is None: - head_mask = ops.ones((config.encoder_layers, config.encoder_attention_heads)) - if decoder_head_mask is None: - decoder_head_mask = ops.ones((config.decoder_layers, config.decoder_attention_heads)) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones((config.decoder_layers, config.decoder_attention_heads)) - - return { - **encoder_dict, - **decoder_dict, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -@require_mindspore -class SpeechT5ModelTester: - def 
__init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=False, - vocab_size=81, - hidden_size=24, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=4, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length, self.hidden_size], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - decoder_input_values = floats_tensor([self.batch_size, self.seq_length, self.hidden_size], scale=1.0) - decoder_attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - inputs_dict = prepare_inputs_dict( - config, - input_values=input_values, - decoder_input_values=decoder_input_values, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_config(self): - return SpeechT5Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - ) - - def create_and_check_model_forward(self, config, inputs_dict): - model = SpeechT5Model(config=config).set_train(False) - - input_values = inputs_dict["input_values"] - attention_mask = inputs_dict["attention_mask"] - decoder_input_values = inputs_dict["decoder_input_values"] - - result = model(input_values, attention_mask=attention_mask, decoder_input_values=decoder_input_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - -@require_mindspore -class SpeechT5ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SpeechT5Model,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"automatic-speech-recognition": SpeechT5ForSpeechToText, "feature-extraction": SpeechT5Model} - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - test_pruning = False - test_headmasking = False - test_resize_embeddings = False - - input_name = "input_values" - - def setUp(self): - self.model_tester = SpeechT5ModelTester(self) - self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_values", - "attention_mask", - "decoder_input_values", - 
"decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - # this model has no inputs_embeds - def test_inputs_embeds(self): - pass - - # this model has no input embeddings - def test_model_get_set_embeddings(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - pass - - @slow - def test_torchscript_output_attentions(self): - # disabled because this model doesn't have decoder_input_ids - pass - - @slow - def test_torchscript_output_hidden_state(self): - # disabled because this model doesn't have decoder_input_ids - pass - - @slow - def test_torchscript_simple(self): - # disabled because this model doesn't have decoder_input_ids - pass - - -@require_mindspore -class SpeechT5ForSpeechToTextTester: - def __init__( - self, - parent, - batch_size=13, - encoder_seq_length=1024, # speech is longer - decoder_seq_length=7, - is_training=False, - hidden_size=24, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=4, - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - vocab_size=81, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.vocab_size = vocab_size - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.encoder_seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.encoder_seq_length]) - - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size).clamp(2) - decoder_attention_mask = random_attention_mask([self.batch_size, self.decoder_seq_length]) - - config = self.get_config() - inputs_dict = prepare_inputs_dict( - config, - input_values=input_values, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_config(self): - return SpeechT5Config( - hidden_size=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - vocab_size=self.vocab_size, - ) - - def 
create_and_check_model_forward(self, config, inputs_dict): - model = SpeechT5ForSpeechToText(config=config).set_train(False) - - input_values = inputs_dict["input_values"] - attention_mask = inputs_dict["attention_mask"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - - result = model(input_values, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = SpeechT5ForSpeechToText(config=config).get_decoder().set_train(False) - input_ids = inputs_dict["decoder_input_ids"] - attention_mask = inputs_dict["decoder_attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-2)) - - -@require_mindspore -class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SpeechT5ForSpeechToText,) if is_mindspore_available() else () - all_generative_model_classes = (SpeechT5ForSpeechToText,) if is_mindspore_available() else () - is_encoder_decoder = True - test_pruning = False - test_headmasking = False - - input_name = "input_values" - - def setUp(self): - self.model_tester = SpeechT5ForSpeechToTextTester(self) - self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_model_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - 
config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - - subsampled_encoder_seq_length = model.speecht5.encoder.prenet._get_feat_extract_output_lengths( - encoder_seq_length - ) - subsampled_encoder_key_length = model.speecht5.encoder.prenet._get_feat_extract_output_lengths( - encoder_key_length - ) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - out_len = len(outputs) - - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - subsampled_encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - - def test_forward_signature(self): - config, _ = 
self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_values", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - else: - seq_length = self.model_tester.seq_length - - subsampled_seq_length = model.speecht5.encoder.prenet._get_feat_extract_output_lengths(seq_length) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [subsampled_seq_length, self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # this model has no inputs_embeds - def test_inputs_embeds(self): - pass - - def 
test_resize_embeddings_untied(self): - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - if not self.test_resize_embeddings: - return - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = ops.clamp(inputs_dict["decoder_input_ids"], max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - def test_resize_tokens_embeddings(self): - original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.set_train(False) - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check 
that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # make sure that decoder_input_ids are resized - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = ops.clamp(inputs_dict["decoder_input_ids"], max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - pass - - # training is not supported yet - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # overwrite from test_modeling_common - def _mock_init_weights(self, call): - if hasattr(call, "weight") and call.weight is not None: - call.weight.data.fill(3) - if hasattr(call, "weight_g") and call.weight_g is not None: - call.weight_g.data.fill(3) - if hasattr(call, "weight_v") and call.weight_v is not None: - call.weight_v.data.fill(3) - if hasattr(call, "has_bias") and call.has_bias: - call.bias.data.fill(3) - if hasattr(call, "masked_spec_embed") and call.masked_spec_embed is not None: - call.masked_spec_embed.data.fill(3) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@slow -class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase): - @cached_property - def default_processor(self): - return SpeechT5Processor.from_pretrained("microsoft/speecht5_asr") - - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_generation_librispeech(self): - model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr") - processor = self.default_processor - - input_speech = self._load_datasamples(1) - - input_values = processor(audio=input_speech, return_tensors="ms").input_values - - generated_ids = model.generate(input_values) - generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True) - - EXPECTED_TRANSCRIPTIONS = [ - "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel" - ] - self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS) - - def test_generation_librispeech_batched(self): - model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr") - processor = self.default_processor - - input_speech = self._load_datasamples(4) - - inputs = processor(audio=input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - generated_ids = 
model.generate(input_values, attention_mask=attention_mask) - generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True) - - EXPECTED_TRANSCRIPTIONS = [ - "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel", - "nor is mister quilter's manner less interesting than his matter", - "he tells us that at this festive season of the year with christmas and rosebeaf looming before us" - " similars drawn from eating and its results occur most readily to the mind", - "he has grave doubts whether sir frederick latin's work is really greek after all and can discover in it" - " but little of rocky ithica", - ] - self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS) - - -@require_mindspore -class SpeechT5ForTextToSpeechTester: - def __init__( - self, - parent, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=1024, # speech is longer - is_training=False, - hidden_size=24, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=4, - vocab_size=81, - num_mel_bins=20, - reduction_factor=2, - speech_decoder_postnet_layers=2, - speech_decoder_postnet_units=32, - speech_decoder_prenet_units=32, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.vocab_size = vocab_size - self.num_mel_bins = num_mel_bins - self.reduction_factor = reduction_factor - self.speech_decoder_postnet_layers = speech_decoder_postnet_layers - self.speech_decoder_postnet_units = speech_decoder_postnet_units - self.speech_decoder_prenet_units = speech_decoder_prenet_units - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2) - attention_mask = random_attention_mask([self.batch_size, self.encoder_seq_length]) - - decoder_input_values = floats_tensor([self.batch_size, self.decoder_seq_length, self.num_mel_bins], scale=1.0) - decoder_attention_mask = random_attention_mask([self.batch_size, self.decoder_seq_length]) - - config = self.get_config() - inputs_dict = prepare_inputs_dict( - config, - input_ids=input_ids, - decoder_input_values=decoder_input_values, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_config(self): - return SpeechT5Config( - hidden_size=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - vocab_size=self.vocab_size, - num_mel_bins=self.num_mel_bins, - reduction_factor=self.reduction_factor, - speech_decoder_postnet_layers=self.speech_decoder_postnet_layers, - speech_decoder_postnet_units=self.speech_decoder_postnet_units, - speech_decoder_prenet_units=self.speech_decoder_prenet_units, - ) - - def create_and_check_model_forward(self, config, inputs_dict): - model = SpeechT5ForTextToSpeech(config=config).set_train(False) - - input_ids = inputs_dict["input_ids"] - attention_mask = 
inputs_dict["attention_mask"] - decoder_input_values = inputs_dict["decoder_input_values"] - - result = model(input_ids, attention_mask=attention_mask, decoder_input_values=decoder_input_values) - self.parent.assertEqual( - result.spectrogram.shape, - (self.batch_size, self.decoder_seq_length * self.reduction_factor, self.num_mel_bins), - ) - - -@require_mindspore -class SpeechT5ForTextToSpeechTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SpeechT5ForTextToSpeech,) if is_mindspore_available() else () - all_generative_model_classes = (SpeechT5ForTextToSpeech,) if is_mindspore_available() else () - is_encoder_decoder = True - test_pruning = False - test_headmasking = False - - input_name = "input_ids" - - def setUp(self): - self.model_tester = SpeechT5ForTextToSpeechTester(self) - self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_model_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_decoder_model_past_with_large_inputs(self): - pass - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_determinism(self): - pass - - @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") - def test_batching_equivalence(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_ids", - "attention_mask", - "decoder_input_values", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # this model has no inputs_embeds - def test_inputs_embeds(self): - 
pass - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_model_outputs_equivalence(self): - pass - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_save_load(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - pass - - @slow - def test_torchscript_output_attentions(self): - # disabled because this model doesn't have decoder_input_ids - pass - - @slow - def test_torchscript_output_hidden_state(self): - # disabled because this model doesn't have decoder_input_ids - pass - - @slow - def test_torchscript_simple(self): - # disabled because this model doesn't have decoder_input_ids - pass - - # training is not supported yet - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # overwrite from test_modeling_common - def _mock_init_weights(self, cell): - if hasattr(cell, "weight") and cell.weight is not None: - cell.weight.data.fill(3) - if hasattr(cell, "weight_g") and cell.weight_g is not None: - cell.weight_g.data.fill(3) - if hasattr(cell, "weight_v") and cell.weight_v is not None: - cell.weight_v.data.fill(3) - if hasattr(cell, "has_bias") and cell.has_bias: - cell.bias.data.fill(3) - - -@slow -@require_mindspore -@require_sentencepiece -@require_tokenizers -class SpeechT5ForTextToSpeechIntegrationTests(unittest.TestCase): - @cached_property - def default_model(self): - return SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") - - @cached_property - def default_processor(self): - return SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") - - @cached_property - def default_vocoder(self): - return SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") - - def test_generation(self): - model = self.default_model - processor = self.default_processor - - input_text = "Mister Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." - input_ids = processor(text=input_text, return_tensors="ms").input_ids - speaker_embeddings = ops.zeros((1, 512)) - - # Generate speech and validate output dimensions - set_seed(555) # Ensure deterministic behavior - generated_speech = model.generate_speech(input_ids, speaker_embeddings=speaker_embeddings) - num_mel_bins = model.config.num_mel_bins - self.assertEqual( - generated_speech.shape[1], num_mel_bins, "Generated speech output has an unexpected number of mel bins." 
- ) - - # Validate generation with additional kwargs using model.generate; - # same method than generate_speech - set_seed(555) # Reset seed for consistent results - generated_speech_with_generate = model.generate( - input_ids, attention_mask=None, speaker_embeddings=speaker_embeddings - ) - self.assertEqual( - generated_speech_with_generate.shape, - generated_speech.shape, - "Shape mismatch between generate_speech and generate methods.", - ) - - def test_one_to_many_generation(self): - model = self.default_model - processor = self.default_processor - vocoder = self.default_vocoder - - input_text = [ - "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel", - "nor is mister quilter's manner less interesting than his matter", - "he tells us that at this festive season of the year with christmas and rosebeaf looming before us", - ] - inputs = processor(text=input_text, padding="max_length", max_length=128, return_tensors="ms") - speaker_embeddings = ops.zeros((1, 512)) - - # Generate spectrograms - set_seed(555) # Ensure deterministic behavior - spectrograms, spectrogram_lengths = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - attention_mask=inputs["attention_mask"], - return_output_lengths=True, - ) - - # Validate generated spectrogram dimensions - expected_batch_size = len(input_text) - num_mel_bins = model.config.num_mel_bins - actual_batch_size, _, actual_num_mel_bins = spectrograms.shape - self.assertEqual(actual_batch_size, expected_batch_size, "Batch size of generated spectrograms is incorrect.") - self.assertEqual( - actual_num_mel_bins, num_mel_bins, "Number of mel bins in batch generated spectrograms is incorrect." - ) - - # Generate waveforms using the vocoder - waveforms = vocoder(spectrograms) - waveform_lengths = [int(waveforms.shape[1] / max(spectrogram_lengths)) * i for i in spectrogram_lengths] - - # Validate generation with integrated vocoder - set_seed(555) # Reset seed for consistent results - waveforms_with_vocoder, waveform_lengths_with_vocoder = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - attention_mask=inputs["attention_mask"], - vocoder=vocoder, - return_output_lengths=True, - ) - # Check consistency between waveforms generated with and without standalone vocoder - self.assertTrue( - np.allclose(waveforms.asnumpy(), waveforms_with_vocoder.asnumpy(), atol=5e-4), - "Mismatch in waveforms generated with and without the standalone vocoder.", - ) - self.assertEqual( - waveform_lengths, - waveform_lengths_with_vocoder, - "Waveform lengths differ between standalone and integrated vocoder generation.", - ) - - # Test generation consistency without returning lengths - set_seed(555) # Reset seed for consistent results - waveforms_with_vocoder_no_lengths = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - attention_mask=inputs["attention_mask"], - vocoder=vocoder, - return_output_lengths=False, - ) - - # Validate waveform consistency without length information - self.assertTrue( - np.allclose(waveforms_with_vocoder_no_lengths.asnumpy(), waveforms_with_vocoder.asnumpy(), atol=1e-4), - "Waveforms differ when generated with and without length information.", - ) - - # Validate batch vs. 
single instance generation consistency - for i, text in enumerate(input_text): - inputs = processor(text=text, padding="max_length", max_length=128, return_tensors="ms") - set_seed(555) # Reset seed for consistent results - spectrogram = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - ) - - # Check spectrogram shape consistency - self.assertEqual( - spectrogram.shape, - spectrograms[i][: spectrogram_lengths[i]].shape, - "Mismatch in spectrogram shape between batch and single instance generation.", - ) - - # Generate and validate waveform for single instance - waveform = vocoder(spectrogram) - self.assertEqual( - waveform.shape, - waveforms[i][: waveform_lengths[i]].shape, - "Mismatch in waveform shape between batch and single instance generation.", - ) - - # Check waveform consistency with integrated vocoder - set_seed(555) # Reset seed for consistent results - waveform_with_integrated_vocoder = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - vocoder=vocoder, - ) - self.assertTrue( - np.allclose(waveform.asnumpy(), waveform_with_integrated_vocoder.asnumpy(), atol=1e-5), - "Mismatch in waveform between standalone and integrated vocoder for single instance generation.", - ) - - def test_batch_generation(self): - model = self.default_model - processor = self.default_processor - vocoder = self.default_vocoder - - input_text = [ - "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel", - "nor is mister quilter's manner less interesting than his matter", - "he tells us that at this festive season of the year with christmas and rosebeaf looming before us", - ] - inputs = processor(text=input_text, padding="max_length", max_length=128, return_tensors="ms") - set_seed(555) # Ensure deterministic behavior - speaker_embeddings = ops.randn((len(input_text), 512)) - - # Generate spectrograms - set_seed(555) # Reset seed for consistent results - spectrograms, spectrogram_lengths = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - attention_mask=inputs["attention_mask"], - return_output_lengths=True, - ) - - # Validate generated spectrogram dimensions - expected_batch_size = len(input_text) - num_mel_bins = model.config.num_mel_bins - actual_batch_size, _, actual_num_mel_bins = spectrograms.shape - self.assertEqual( - actual_batch_size, - expected_batch_size, - "Batch size of generated spectrograms is incorrect.", - ) - self.assertEqual( - actual_num_mel_bins, - num_mel_bins, - "Number of mel bins in batch generated spectrograms is incorrect.", - ) - - # Generate waveforms using the vocoder - waveforms = vocoder(spectrograms) - waveform_lengths = [int(waveforms.shape[1] / max(spectrogram_lengths)) * i for i in spectrogram_lengths] - - # Validate generation with integrated vocoder - set_seed(555) # Reset seed for consistent results - waveforms_with_vocoder, waveform_lengths_with_vocoder = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - attention_mask=inputs["attention_mask"], - vocoder=vocoder, - return_output_lengths=True, - ) - # raise ValueError(f"{waveforms, waveforms_with_vocoder}") - # Check consistency between waveforms generated with and without standalone vocoder - self.assertTrue( - np.allclose(waveforms.asnumpy(), waveforms_with_vocoder.asnumpy(), atol=1e-5), - "Mismatch in waveforms generated with and without the standalone vocoder.", - ) - self.assertEqual( 
- waveform_lengths, - waveform_lengths_with_vocoder, - "Waveform lengths differ between standalone and integrated vocoder generation.", - ) - - # Test generation consistency without returning lengths - set_seed(555) # Reset seed for consistent results - waveforms_with_vocoder_no_lengths = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=speaker_embeddings, - attention_mask=inputs["attention_mask"], - vocoder=vocoder, - return_output_lengths=False, - ) - - # Validate waveform consistency without length information - self.assertTrue( - np.allclose(waveforms_with_vocoder_no_lengths.asnumpy(), waveforms_with_vocoder.asnumpy(), atol=5e-4), - "Waveforms differ when generated with and without length information.", - ) - - # Validate batch vs. single instance generation consistency - for i, text in enumerate(input_text): - inputs = processor(text=text, padding="max_length", max_length=128, return_tensors="ms") - current_speaker_embedding = speaker_embeddings[i].unsqueeze(0) - set_seed(555) # Reset seed for consistent results - spectrogram = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=current_speaker_embedding, - ) - - # Check spectrogram shape consistency - self.assertEqual( - spectrogram.shape, - spectrograms[i][: spectrogram_lengths[i]].shape, - "Mismatch in spectrogram shape between batch and single instance generation.", - ) - - # Generate and validate waveform for single instance - waveform = vocoder(spectrogram) - self.assertEqual( - waveform.shape, - waveforms[i][: waveform_lengths[i]].shape, - "Mismatch in waveform shape between batch and single instance generation.", - ) - - # Check waveform consistency with integrated vocoder - set_seed(555) # Reset seed for consistent results - waveform_with_integrated_vocoder = model.generate_speech( - input_ids=inputs["input_ids"], - speaker_embeddings=current_speaker_embedding, - vocoder=vocoder, - ) - self.assertTrue( - np.allclose(waveform.asnumpy(), waveform_with_integrated_vocoder.asnumpy(), atol=1e-5), - "Mismatch in waveform between standalone and integrated vocoder for single instance generation.", - ) - - -@require_mindspore -class SpeechT5ForSpeechToSpeechTester: - def __init__( - self, - parent, - batch_size=13, - encoder_seq_length=1024, # speech is longer - decoder_seq_length=1024, - is_training=False, - hidden_size=24, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=4, - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - vocab_size=81, - num_mel_bins=20, - reduction_factor=2, - speech_decoder_postnet_layers=2, - speech_decoder_postnet_units=32, - speech_decoder_prenet_units=32, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.vocab_size = vocab_size - self.num_mel_bins = num_mel_bins - self.reduction_factor = reduction_factor - self.speech_decoder_postnet_layers = speech_decoder_postnet_layers - 
self.speech_decoder_postnet_units = speech_decoder_postnet_units - self.speech_decoder_prenet_units = speech_decoder_prenet_units - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.encoder_seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.encoder_seq_length]) - - decoder_input_values = floats_tensor([self.batch_size, self.decoder_seq_length, self.num_mel_bins], scale=1.0) - decoder_attention_mask = random_attention_mask([self.batch_size, self.decoder_seq_length]) - - config = self.get_config() - inputs_dict = prepare_inputs_dict( - config, - input_values=input_values, - decoder_input_values=decoder_input_values, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_config(self): - return SpeechT5Config( - hidden_size=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - vocab_size=self.vocab_size, - num_mel_bins=self.num_mel_bins, - reduction_factor=self.reduction_factor, - speech_decoder_postnet_layers=self.speech_decoder_postnet_layers, - speech_decoder_postnet_units=self.speech_decoder_postnet_units, - speech_decoder_prenet_units=self.speech_decoder_prenet_units, - ) - - def create_and_check_model_forward(self, config, inputs_dict): - model = SpeechT5ForSpeechToSpeech(config=config).set_train(False) - - input_values = inputs_dict["input_values"] - attention_mask = inputs_dict["attention_mask"] - decoder_input_values = inputs_dict["decoder_input_values"] - - result = model(input_values, attention_mask=attention_mask, decoder_input_values=decoder_input_values) - self.parent.assertEqual( - result.spectrogram.shape, - (self.batch_size, self.decoder_seq_length * self.reduction_factor, self.num_mel_bins), - ) - - -@require_mindspore -class SpeechT5ForSpeechToSpeechTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SpeechT5ForSpeechToSpeech,) if is_mindspore_available() else () - all_generative_model_classes = (SpeechT5ForSpeechToSpeech,) if is_mindspore_available() else () - is_encoder_decoder = True - test_pruning = False - test_headmasking = False - test_resize_embeddings = False - - input_name = "input_values" - - def setUp(self): - self.model_tester = SpeechT5ForSpeechToSpeechTester(self) - self.config_tester = ConfigTester(self, config_class=SpeechT5Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_model_forward(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_decoder_model_past_with_large_inputs(self): - pass - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_determinism(self): - pass - - @unittest.skip("skipped because there is always dropout in SpeechT5SpeechDecoderPrenet") - def test_batching_equivalence(self): - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - - subsampled_encoder_seq_length = model.speecht5.encoder.prenet._get_feat_extract_output_lengths( - encoder_seq_length - ) - subsampled_encoder_key_length = model.speecht5.encoder.prenet._get_feat_extract_output_lengths( - encoder_key_length - ) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - out_len = len(outputs) - - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - subsampled_encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = 
model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_values", - "attention_mask", - "decoder_input_values", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - else: - seq_length = self.model_tester.seq_length - - subsampled_seq_length = model.speecht5.encoder.prenet._get_feat_extract_output_lengths(seq_length) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [subsampled_seq_length, self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", 
- "masked_spec_embed", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # this model has no inputs_embeds - def test_inputs_embeds(self): - pass - - # this model has no input embeddings - def test_model_get_set_embeddings(self): - pass - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_model_outputs_equivalence(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients - pass - - # skipped because there is always dropout in SpeechT5SpeechDecoderPrenet - def test_save_load(self): - pass - - @slow - def test_torchscript_output_attentions(self): - # disabled because this model doesn't have decoder_input_ids - pass - - @slow - def test_torchscript_output_hidden_state(self): - # disabled because this model doesn't have decoder_input_ids - pass - - @slow - def test_torchscript_simple(self): - # disabled because this model doesn't have decoder_input_ids - pass - - # training is not supported yet - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # overwrite from test_modeling_common - def _mock_init_weights(self, cell): - if hasattr(cell, "weight") and cell.weight is not None: - cell.weight.data.fill(3) - if hasattr(cell, "weight_g") and cell.weight_g is not None: - cell.weight_g.data.fill(3) - if hasattr(cell, "weight_v") and cell.weight_v is not None: - cell.weight_v.data.fill(3) - if hasattr(cell, "has_bias") and cell.has_bias: - cell.bias.data.fill(3) - if hasattr(cell, "masked_spec_embed") and cell.masked_spec_embed is not None: - cell.masked_spec_embed.data.fill(3) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@slow -class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase): - @cached_property - def default_processor(self): - return SpeechT5Processor.from_pretrained("microsoft/speecht5_vc") - - def _load_datasamples(self, num_samples): - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_generation_librispeech(self): - model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc") - processor = self.default_processor - - input_speech = self._load_datasamples(1) - input_values = processor(audio=input_speech, return_tensors="ms").input_values - - speaker_embeddings = ops.zeros((1, 512)) - # raise ValueError("input_values shape:", 
input_values.shape) # (1, 93680) - generated_speech = model.generate_speech(input_values, speaker_embeddings=speaker_embeddings) - - self.assertEqual(generated_speech.shape[1], model.config.num_mel_bins) - self.assertGreaterEqual(generated_speech.shape[0], 300) - self.assertLessEqual(generated_speech.shape[0], 310) - - -class SpeechT5HifiGanTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=False, - num_mel_bins=20, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.num_mel_bins = num_mel_bins - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0) - config = self.get_config() - return config, input_values - - def get_config(self): - return SpeechT5HifiGanConfig( - model_in_dim=self.num_mel_bins, - upsample_initial_channel=32, - ) - - def create_and_check_model(self, config, input_values): - model = SpeechT5HifiGan(config=config).set_train(False) - result = model(input_values) - self.parent.assertEqual(result.shape, (self.seq_length * 256,)) - - def prepare_config_and_inputs_for_common(self): - config, input_values = self.prepare_config_and_inputs() - inputs_dict = {"spectrogram": input_values} - return config, inputs_dict - - -@require_mindspore -class SpeechT5HifiGanTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SpeechT5HifiGan,) if is_mindspore_available() else () - test_torchscript = False - test_pruning = False - test_resize_embeddings = False - test_resize_position_embeddings = False - test_head_masking = False - test_mismatched_shapes = False - test_missing_keys = False - test_model_parallel = False - is_encoder_decoder = False - has_attentions = False - - input_name = "spectrogram" - - def setUp(self): - self.model_tester = SpeechT5HifiGanTester(self) - self.config_tester = ConfigTester(self, config_class=SpeechT5HifiGanConfig) - - def test_config(self): - self.config_tester.create_and_test_config_to_json_string() - self.config_tester.create_and_test_config_to_json_file() - self.config_tester.create_and_test_config_from_and_save_pretrained() - self.config_tester.create_and_test_config_from_and_save_pretrained_subfolder() - self.config_tester.create_and_test_config_with_num_labels() - self.config_tester.check_config_can_be_init_without_params() - self.config_tester.check_config_arguments_init() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "spectrogram", - ] - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - # this model does not output hidden states - def test_hidden_states_output(self): - pass - - # skip - def test_initialization(self): - pass - - # this model has no inputs_embeds - def test_inputs_embeds(self): - pass - - # this model has no input embeddings - def test_model_get_set_embeddings(self): - pass - - # skip as this model doesn't support all arguments tested - def test_model_outputs_equivalence(self): - pass - - # this model does not output hidden 
states - def test_retain_grad_hidden_states_attentions(self): - pass - - # skip because it fails on automapping of SpeechT5HifiGanConfig - def test_save_load_fast_init_from_base(self): - pass - - # skip because it fails on automapping of SpeechT5HifiGanConfig - def test_save_load_fast_init_to_base(self): - pass - - def test_batched_inputs_outputs(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - - batched_inputs = inputs["spectrogram"].unsqueeze(0).tile((2, 1, 1)) - batched_outputs = model(batched_inputs) - - self.assertEqual( - batched_inputs.shape[0], batched_outputs.shape[0], msg="Got different batch dims for input and output" - ) - - def test_unbatched_inputs_outputs(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - - outputs = model(inputs["spectrogram"]) - self.assertTrue(outputs.dim() == 1, msg="Got un-batched inputs but batched output") diff --git a/tests/transformers/models/speecht5/test_processor_speecht5.py b/tests/transformers/models/speecht5/test_processor_speecht5.py deleted file mode 100644 index d47395787..000000000 --- a/tests/transformers/models/speecht5/test_processor_speecht5.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tests for the SpeechT5 processors.""" - -import json -import os -import shutil -import tempfile -import unittest - -from mindnlp.transformers.models.speecht5 import SpeechT5Tokenizer -from mindnlp.utils.testing_utils import get_tests_dir, require_mindspore -from mindnlp.utils import is_mindspore_available -from mindnlp.configs import FEATURE_EXTRACTOR_NAME - - -if is_mindspore_available(): - from mindnlp.transformers import SpeechT5FeatureExtractor, SpeechT5Processor - - from .test_feature_extraction_speecht5 import floats_list - - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model") - - -@require_mindspore -class SpeechT5ProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) - - feature_extractor_map = { - "feature_size": 1, - "padding_value": 0.0, - "sampling_rate": 16000, - "do_normalize": False, - "num_mel_bins": 80, - "hop_length": 16, - "win_length": 64, - "win_function": "hann_window", - "fmin": 80, - "fmax": 7600, - "mel_floor": 1e-10, - "reduction_factor": 2, - "return_attention_mask": True, - } - - self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) - with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(feature_extractor_map) + "\n") - - def get_tokenizer(self, **kwargs): - return SpeechT5Tokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_feature_extractor(self, **kwargs): - return SpeechT5FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - processor.save_pretrained(self.tmpdirname) - processor = SpeechT5Processor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, SpeechT5Tokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, SpeechT5FeatureExtractor) - - def test_save_load_pretrained_additional_features(self): - processor = SpeechT5Processor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) - - processor = SpeechT5Processor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, SpeechT5Tokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, SpeechT5FeatureExtractor) - - def test_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = 
feature_extractor(audio=raw_speech, return_tensors="np") - input_processor = processor(audio=raw_speech, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_feature_extractor_target(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = feature_extractor(audio_target=raw_speech, return_tensors="np") - input_processor = processor(audio_target=raw_speech, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = processor(text=input_str) - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_tokenizer_target(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = processor(text_target=input_str) - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - self.assertListEqual( - processor.model_input_names, - feature_extractor.model_input_names, - msg="`processor` and `feature_extractor` model input names do not match", - ) diff --git a/tests/transformers/models/speecht5/test_tokenization_speecht5.py b/tests/transformers/models/speecht5/test_tokenization_speecht5.py deleted file mode 100644 index eef6b15c2..000000000 --- a/tests/transformers/models/speecht5/test_tokenization_speecht5.py +++ /dev/null @@ -1,222 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tests for the SpeechT5 tokenizers.""" - -import unittest - -from mindnlp.transformers.models.speecht5 import SpeechT5Tokenizer -from mindnlp.utils.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow -from mindnlp.transformers.tokenization_utils_base import AddedToken - -from ...test_tokenization_common import TokenizerTesterMixin - -SPIECE_UNDERLINE = "▁" -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model") - - -@require_sentencepiece -@require_tokenizers -class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "microsoft/speecht5_asr" - tokenizer_class = SpeechT5Tokenizer - test_rust_tokenizer = False - test_sentencepiece = True - - def setUp(self): - super().setUp() - - # We have a SentencePiece fixture for testing - tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB) - - mask_token = AddedToken("", lstrip=True, rstrip=False) - tokenizer.mask_token = mask_token - tokenizer.add_special_tokens({"mask_token": mask_token}) - tokenizer.add_tokens([""]) - - tokenizer.save_pretrained(self.tmpdirname) - - def get_input_output_texts(self, tokenizer): - input_text = "this is a test" - output_text = "this is a test" - return input_text, output_text - - def get_numeric_input_output_texts(self): - input_text = "I have $123.45 and owe €59.78. My balance is -₴876.90 and have 73% stocks in my company which equals to ₦72649201" - output_text = "I have one hundred and twenty three point four five dollars and owe fifty nine point seven eight euros. My balance is minus eight hundred and seventy six point nine zero ukrainian hryvnia and have seventy three percent stocks in my company which equals to seventy two million six hundred and forty nine thousand two hundred and one nigerian naira" - return input_text, output_text - - def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5): - input_text, output_text = self.get_input_output_texts(tokenizer) - ids = tokenizer.encode(output_text, add_special_tokens=False) - text = tokenizer.decode(ids, clean_up_tokenization_spaces=False) - return text, ids - - def test_tokenizer_normalization(self): - tokenizer = self.get_tokenizer(normalize=True) - input_text, expected_text = self.get_numeric_input_output_texts() - input_ids = tokenizer.encode(input_text) - output_text = tokenizer.decode(input_ids, skip_special_tokens=True) - self.assertEqual(output_text, expected_text) - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 1 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - vocab_keys = list(self.get_tokenizer().get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-4], "œ") - self.assertEqual(vocab_keys[-2], "") - self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), 81) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 79) - - def test_add_tokens_tokenizer(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - - # We usually have added tokens from the start in tests because our vocab fixtures are - # 
smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) - - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.encode( - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-3], tokens[-4]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-3], tokenizer.pad_token_id) - - def test_pickle_subword_regularization_tokenizer(self): - pass - - def test_subword_regularization_tokenizer(self): - pass - - def test_full_tokenizer(self): - tokenizer = self.get_tokenizer(normalize=True) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, [SPIECE_UNDERLINE, 'T', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'a', SPIECE_UNDERLINE, 't', 'e', 's', 't']) # fmt: skip - - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), - [4, 32, 11, 10, 12, 4, 10, 12, 4, 7, 4, 6, 5, 12, 6], - ) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual(tokens,[SPIECE_UNDERLINE, 'I', SPIECE_UNDERLINE, 'w', 'a', 's', SPIECE_UNDERLINE, 'b', 'o', 'r', 'n', SPIECE_UNDERLINE, 'i', 'n', SPIECE_UNDERLINE, 'n', 'i', 'n', 'e', 't', 'y', SPIECE_UNDERLINE, 't', 'w', 'o', SPIECE_UNDERLINE, 't', 'h', 'o', 'u', 's', 'a', 'n', 'd', ',', SPIECE_UNDERLINE, 'a', 'n', 'd', SPIECE_UNDERLINE, 't', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'f', 'a', 'l', 's', 'é', '.']) # fmt: skip - - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual(ids, [4, 30, 4, 20, 7, 12, 4, 25, 8, 13, 9, 4, 10, 9, 4, 9, 10, 9, 5, 6, 22, 4, 6, 20, 8, 4, 6, 11, 8, 16, 12, 7, 9, 14, 23, 4, 7, 9, 14, 4, 6, 11, 10, 12, 4, 10, 12, 4, 19, 7, 15, 12, 73, 26]) # fmt: skip - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens,[SPIECE_UNDERLINE, 'I', SPIECE_UNDERLINE, 'w', 'a', 's', SPIECE_UNDERLINE, 'b', 'o', 'r', 'n', SPIECE_UNDERLINE, 'i', 'n', SPIECE_UNDERLINE, 'n', 'i', 'n', 'e', 't', 'y', SPIECE_UNDERLINE, 't', 'w', 'o', SPIECE_UNDERLINE, 't', 'h', 'o', 'u', 's', 'a', 'n', 'd', ',', SPIECE_UNDERLINE, 'a', 'n', 'd', SPIECE_UNDERLINE, 't', 'h', 'i', 's', SPIECE_UNDERLINE, 'i', 's', SPIECE_UNDERLINE, 'f', 'a', 'l', 's', 'é', '.']) # fmt: skip - - @slow - def test_tokenizer_integration(self): - # 
Use custom sequence because this tokenizer does not handle numbers. - sequences = [ - "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " - "general-purpose architectures (BERT, GPT, RoBERTa, XLM, DistilBert, XLNet...) for Natural " - "Language Understanding (NLU) and Natural Language Generation (NLG) with over thirty-two pretrained " - "models in one hundred plus languages and deep interoperability between Jax, PyTorch and TensorFlow.", - "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " - "conditioning on both left and right context in all layers.", - "The quick brown fox jumps over the lazy dog.", - ] - - # fmt: off - expected_encoding = { - 'input_ids': [ - [4, 32, 13, 7, 9, 12, 19, 8, 13, 18, 5, 13, 12, 4, 64, 19, 8, 13, 18, 5, 13, 15, 22, 4, 28, 9, 8, 20, 9, 4, 7, 12, 4, 24, 22, 6, 8, 13, 17, 11, 39, 6, 13, 7, 9, 12, 19, 8, 13, 18, 5, 13, 12, 4, 7, 9, 14, 4, 24, 22, 6, 8, 13, 17, 11, 39, 24, 13, 5, 6, 13, 7, 10, 9, 5, 14, 39, 25, 5, 13, 6, 63, 4, 24, 13, 8, 27, 10, 14, 5, 12, 4, 21, 5, 9, 5, 13, 7, 15, 39, 24, 16, 13, 24, 8, 12, 5, 4, 7, 13, 17, 11, 10, 6, 5, 17, 6, 16, 13, 5, 12, 4, 64, 40, 47, 54, 32, 23, 4, 53, 49, 32, 23, 4, 54, 8, 40, 47, 54, 32, 7, 23, 4, 69, 52, 43, 23, 4, 51, 10, 12, 6, 10, 15, 40, 5, 13, 6, 23, 4, 69, 52, 48, 5, 6, 26, 26, 26, 63, 4, 19, 8, 13, 4, 48, 7, 6, 16, 13, 7, 15, 4, 52, 7, 9, 21, 16, 7, 21, 5, 4, 61, 9, 14, 5, 13, 12, 6, 7, 9, 14, 10, 9, 21, 4, 64, 48, 52, 61, 63, 4, 7, 9, 14, 4, 48, 7, 6, 16, 13, 7, 15, 4, 52, 7, 9, 21, 16, 7, 21, 5, 4, 53, 5, 9, 5, 13, 7, 6, 10, 8, 9, 4, 64, 48, 52, 53, 63, 4, 20, 10, 6, 11, 4, 8, 27, 5, 13, 4, 6, 11, 10, 13, 6, 22, 39, 6, 20, 8, 4, 24, 13, 5, 6, 13, 7, 10, 9, 5, 14, 4, 18, 8, 14, 5, 15, 12, 4, 10, 9, 4, 8, 9, 5, 4, 11, 16, 9, 14, 13, 5, 14, 4, 24, 15, 16, 12, 4, 15, 7, 9, 21, 16, 7, 21, 5, 12, 4, 7, 9, 14, 4, 14, 5, 5, 24, 4, 10, 9, 6, 5, 13, 8, 24, 5, 13, 7, 25, 10, 15, 10, 6, 22, 4, 25, 5, 6, 20, 5, 5, 9, 4, 58, 7, 37, 23, 4, 49, 22, 32, 8, 13, 17, 11, 4, 7, 9, 14, 4, 32, 5, 9, 12, 8, 13, 55, 15, 8, 20, 26, 2], - [4, 40, 47, 54, 32, 4, 10, 12, 4, 14, 5, 12, 10, 21, 9, 5, 14, 4, 6, 8, 4, 24, 13, 5, 39, 6, 13, 7, 10, 9, 4, 14, 5, 5, 24, 4, 25, 10, 14, 10, 13, 5, 17, 6, 10, 8, 9, 7, 15, 4, 13, 5, 24, 13, 5, 12, 5, 9, 6, 7, 6, 10, 8, 9, 12, 4, 19, 13, 8, 18, 4, 16, 9, 15, 7, 25, 5, 15, 5, 14, 4, 6, 5, 37, 6, 4, 25, 22, 4, 46, 8, 10, 9, 6, 15, 22, 4, 17, 8, 9, 14, 10, 6, 10, 8, 9, 10, 9, 21, 4, 8, 9, 4, 25, 8, 6, 11, 4, 15, 5, 19, 6, 4, 7, 9, 14, 4, 13, 10, 21, 11, 6, 4, 17, 8, 9, 6, 5, 37, 6, 4, 10, 9, 4, 7, 15, 15, 4, 15, 7, 22, 5, 13, 12, 26, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [4, 32, 11, 5, 4, 45, 16, 10, 17, 28, 4, 25, 13, 8, 20, 9, 4, 19, 8, 37, 4, 46, 16, 18, 24, 12, 4, 8, 27, 5, 13, 4, 6, 11, 5, 4, 15, 7, 57, 22, 4, 14, 8, 21, 26, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - ], - 'attention_mask': [ - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - ] - } - # fmt: on - - self.tokenizer_integration_test_util( - expected_encoding=expected_encoding, - model_name="microsoft/speecht5_asr", - revision="c5ef64c71905caeccde0e4462ef3f9077224c524", - sequences=sequences, - ) - - def test_encode_decode(self): - tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts") - - tokens = tokenizer.tokenize("a = b") - self.assertEqual(tokens, ["▁", "a", "▁", "=", "▁", "b"]) - - # the `'='` is unknown. - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertEqual(ids, [4, 7, 4, 3, 4, 25]) - - # let's make sure decoding with the special unknown tokens preserves spaces - ids = tokenizer.encode("a = b") - self.assertEqual(tokenizer.decode(ids), "a b") - - @unittest.skip(reason="no pretrained tokenizer for Speecht5 model") - def test_pretrained_model_lists(self): - pass diff --git a/tests/transformers/models/splinter/__init__.py b/tests/transformers/models/splinter/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/splinter/test_modeling_splinter.py b/tests/transformers/models/splinter/test_modeling_splinter.py deleted file mode 100644 index af2ce3622..000000000 --- a/tests/transformers/models/splinter/test_modeling_splinter.py +++ /dev/null @@ -1,653 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -"""Testing suite for the MindSpore Splinter model.""" - -import copy -import unittest - -from mindspore import ops - -from mindnlp.utils.import_utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - SplinterConfig, - SplinterForPreTraining, - SplinterForQuestionAnswering, - SplinterModel, - ) - - -class SplinterModelTester: - def __init__( - self, - parent, - batch_size=13, - num_questions=3, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - question_token_id=1, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.num_questions = num_questions - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.question_token_id = question_token_id - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids[:, 1] = self.question_token_id - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size - ) - - start_positions = None - end_positions = None - question_positions = None - if self.use_labels: - start_positions = ids_tensor( - [self.batch_size, self.num_questions], self.type_sequence_label_size - ) - end_positions = ids_tensor( - [self.batch_size, self.num_questions], self.type_sequence_label_size - ) - question_positions = ids_tensor( - [self.batch_size, self.num_questions], self.num_labels - ) - - config = SplinterConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - 
initializer_range=self.initializer_range, - question_token_id=self.question_token_id, - ) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - start_positions, - end_positions, - question_positions, - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - start_positions, - end_positions, - question_positions, - ): - model = SplinterModel(config=config) - model.set_train(False) - result = model( - input_ids, attention_mask=input_mask, token_type_ids=token_type_ids - ) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - token_type_ids, - input_mask, - start_positions, - end_positions, - question_positions, - ): - model = SplinterForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=start_positions[:, 0], - end_positions=end_positions[:, 0], - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def create_and_check_for_pretraining( - self, - config, - input_ids, - token_type_ids, - input_mask, - start_positions, - end_positions, - question_positions, - ): - model = SplinterForPreTraining(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=start_positions, - end_positions=end_positions, - question_positions=question_positions, - ) - self.parent.assertEqual( - result.start_logits.shape, - (self.batch_size, self.num_questions, self.seq_length), - ) - self.parent.assertEqual( - result.end_logits.shape, - (self.batch_size, self.num_questions, self.seq_length), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - start_positions, - end_positions, - question_positions, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - } - return config, inputs_dict - - -@require_mindspore -class SplinterModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - SplinterModel, - SplinterForQuestionAnswering, - SplinterForPreTraining, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": SplinterModel, - "question-answering": SplinterForQuestionAnswering, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests when this model gets more usage - def is_pipeline_test_to_skip( - self, - pipeline_test_casse_name, - config_class, - model_architecture, - tokenizer_name, - processor_name, - ): - if pipeline_test_casse_name == "QAPipelineTests": - return True - elif ( - pipeline_test_casse_name == "FeatureExtractionPipelineTests" - and tokenizer_name.endswith("Fast") - ): - return True - - return False - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - if return_labels: - if issubclass(model_class, SplinterForPreTraining): - inputs_dict["start_positions"] = ops.zeros( - (self.model_tester.batch_size, 
self.model_tester.num_questions), - dtype=mindspore.int32, - ) - inputs_dict["end_positions"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_questions), - dtype=mindspore.int32, - ) - inputs_dict["question_positions"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.num_questions), - dtype=mindspore.int32, - ) - elif issubclass(model_class, SplinterForQuestionAnswering): - inputs_dict["start_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int32 - ) - inputs_dict["end_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int32 - ) - - return inputs_dict - - def setUp(self): - self.model_tester = SplinterModelTester(self) - self.config_tester = ConfigTester( - self, config_class=SplinterConfig, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - - model.set_train(False) - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - if isinstance(model, SplinterForPreTraining): - with self.assertRaises(TypeError): - # question_positions must not be None. - _ = model(**inputs)[0] - else: - _ = model(**inputs)[0] - - @slow - def test_model_from_pretrained(self): - model_name = "tau/splinter-base" - model = SplinterModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - # overwrite from common since `SplinterForPreTraining` could contain different number of question tokens in inputs. - # When the batch is distributed to multiple devices, each replica could get different values for the maximal number - # of question tokens (see `SplinterForPreTraining._prepare_question_positions()`), and the model returns different - # shape along dimension 1 (i.e. `num_questions`) that could not be combined into a single tensor as an output. 
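The comment above refers to `SplinterForPreTraining._prepare_question_positions()`, and the integration test further down (`test_splinter_pretraining_prepare_question_positions`) exercises the same zero-padded convention. As a hedged illustration of what such position gathering can look like, here is a minimal editor's sketch in plain NumPy; the helper name and implementation are assumptions for clarity, not the library's code:

    import numpy as np

    def sketch_prepare_question_positions(input_ids: np.ndarray, question_token_id: int) -> np.ndarray:
        # For each row, collect the indices where the question token appears,
        # then right-pad every row with zeros up to the batch's maximal question count.
        positions = [np.flatnonzero(row == question_token_id) for row in input_ids]
        max_questions = max(len(p) for p in positions)
        padded = np.zeros((len(positions), max_questions), dtype=np.int32)
        for i, p in enumerate(positions):
            padded[i, : len(p)] = p
        return padded

    # Example with question_token_id = 104, mirroring the integration test below:
    batch = np.array([[101, 104, 7796, 104, 102], [101, 7796, 104, 102, 0]])
    print(sketch_prepare_question_positions(batch, 104))  # [[1 3] [2 0]]

Because each row is padded to the per-batch maximum, replicas seeing different question counts can produce differently shaped outputs, which is the situation the skipped common test above is guarding against.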
- - -@require_mindspore -class SplinterModelIntegrationTest(unittest.TestCase): - @slow - def test_splinter_question_answering(self): - model = SplinterForQuestionAnswering.from_pretrained( - "tau/splinter-base-qass", from_pt=True - ) - - # Input: "[CLS] Brad was born in [QUESTION] . He returned to the United Kingdom later . [SEP]" - # Output should be the span "the United Kingdom" - input_ids = mindspore.tensor( - [ - [ - 101, - 7796, - 1108, - 1255, - 1107, - 104, - 119, - 1124, - 1608, - 1106, - 1103, - 1244, - 2325, - 1224, - 119, - 102, - ] - ] - ) - output = model(input_ids) - - expected_shape = tuple((1, 16)) - self.assertEqual(output.start_logits.shape, expected_shape) - self.assertEqual(output.end_logits.shape, expected_shape) - - self.assertEqual(ops.argmax(output.start_logits), 10) - self.assertEqual(ops.argmax(output.end_logits), 12) - - @slow - def test_splinter_pretraining(self): - model = SplinterForPreTraining.from_pretrained( - "tau/splinter-base-qass", from_pt=True - ) - - # Input: "[CLS] [QUESTION] was born in [QUESTION] . Brad returned to the United Kingdom later . [SEP]" - # Output should be the spans "Brad" and "the United Kingdom" - input_ids = mindspore.tensor( - [ - [ - 101, - 104, - 1108, - 1255, - 1107, - 104, - 119, - 7796, - 1608, - 1106, - 1103, - 1244, - 2325, - 1224, - 119, - 102, - ] - ] - ) - question_positions = mindspore.tensor([[1, 5]], dtype=mindspore.int32) - output = model(input_ids, question_positions=question_positions) - - expected_shape = tuple((1, 2, 16)) - self.assertEqual(output.start_logits.shape, expected_shape) - self.assertEqual(output.end_logits.shape, expected_shape) - - self.assertEqual(ops.argmax(output.start_logits[0, 0]), 7) - self.assertEqual(ops.argmax(output.end_logits[0, 0]), 7) - self.assertEqual(ops.argmax(output.start_logits[0, 1]), 10) - self.assertEqual(ops.argmax(output.end_logits[0, 1]), 12) - - @slow - def test_splinter_pretraining_loss_requires_question_positions(self): - model = SplinterForPreTraining.from_pretrained( - "tau/splinter-base-qass", from_pt=True - ) - - # Input: "[CLS] [QUESTION] was born in [QUESTION] . Brad returned to the United Kingdom later . [SEP]" - # Output should be the spans "Brad" and "the United Kingdom" - input_ids = mindspore.tensor( - [ - [ - 101, - 104, - 1108, - 1255, - 1107, - 104, - 119, - 7796, - 1608, - 1106, - 1103, - 1244, - 2325, - 1224, - 119, - 102, - ] - ] - ) - start_positions = mindspore.tensor([[7, 10]], dtype=mindspore.int32) - end_positions = mindspore.tensor([7, 12], dtype=mindspore.int32) - with self.assertRaises(TypeError): - model( - input_ids, - start_positions=start_positions, - end_positions=end_positions, - ) - - @slow - def test_splinter_pretraining_loss(self): - model = SplinterForPreTraining.from_pretrained( - "tau/splinter-base-qass", from_pt=True - ) - - # Input: "[CLS] [QUESTION] was born in [QUESTION] . Brad returned to the United Kingdom later . 
[SEP]" - # Output should be the spans "Brad" and "the United Kingdom" - input_ids = mindspore.tensor( - [ - [ - 101, - 104, - 1108, - 1255, - 1107, - 104, - 119, - 7796, - 1608, - 1106, - 1103, - 1244, - 2325, - 1224, - 119, - 102, - ], - [ - 101, - 104, - 1108, - 1255, - 1107, - 104, - 119, - 7796, - 1608, - 1106, - 1103, - 1244, - 2325, - 1224, - 119, - 102, - ], - ] - ) - start_positions = mindspore.tensor([[7, 10], [7, 10]], dtype=mindspore.int32) - end_positions = mindspore.tensor([[7, 12], [7, 12]], dtype=mindspore.int32) - question_positions = mindspore.tensor([[1, 5], [1, 5]], dtype=mindspore.int32) - output = model( - input_ids, - start_positions=start_positions, - end_positions=end_positions, - question_positions=question_positions, - ) - self.assertAlmostEqual(output.loss.item(), 0.0024, 4) - - @slow - def test_splinter_pretraining_loss_with_padding(self): - model = SplinterForPreTraining.from_pretrained( - "tau/splinter-base-qass", from_pt=True - ) - - # Input: "[CLS] [QUESTION] was born in [QUESTION] . Brad returned to the United Kingdom later . [SEP]" - # Output should be the spans "Brad" and "the United Kingdom" - input_ids = mindspore.tensor( - [ - [ - 101, - 104, - 1108, - 1255, - 1107, - 104, - 119, - 7796, - 1608, - 1106, - 1103, - 1244, - 2325, - 1224, - 119, - 102, - ], - ] - ) - start_positions = mindspore.tensor([[7, 10]], dtype=mindspore.int32) - end_positions = mindspore.tensor([7, 12], dtype=mindspore.int32) - question_positions = mindspore.tensor([[1, 5]], dtype=mindspore.int32) - start_positions_with_padding = mindspore.tensor( - [[7, 10, 0]], dtype=mindspore.int32 - ) - end_positions_with_padding = mindspore.tensor([7, 12, 0], dtype=mindspore.int32) - question_positions_with_padding = mindspore.tensor( - [[1, 5, 0]], dtype=mindspore.int32 - ) - output = model( - input_ids, - start_positions=start_positions, - end_positions=end_positions, - question_positions=question_positions, - ) - output_with_padding = model( - input_ids, - start_positions=start_positions_with_padding, - end_positions=end_positions_with_padding, - question_positions=question_positions_with_padding, - ) - - self.assertAlmostEqual(output.loss.item(), output_with_padding.loss.item(), 4) - - # Note that the original code uses 0 to denote padded question tokens - # and their start and end positions. As the pad_token_id of the model's - # config is used for the losse's ignore_index in SplinterForPreTraining, - # we add this test to ensure anybody making changes to the default - # value of the config, will be aware of the implication. 
- self.assertEqual(model.config.pad_token_id, 0) - - @slow - def test_splinter_pretraining_prepare_question_positions(self): - model = SplinterForPreTraining.from_pretrained( - "tau/splinter-base-qass", from_pt=True - ) - - input_ids = mindspore.tensor( - [ - [101, 104, 1, 2, 104, 3, 4, 102], - [101, 1, 104, 2, 104, 3, 104, 102], - [101, 1, 2, 104, 104, 3, 4, 102], - [101, 1, 2, 3, 4, 5, 104, 102], - ] - ) - question_positions = mindspore.tensor( - [[1, 4, 0], [2, 4, 6], [3, 4, 0], [6, 0, 0]], dtype=mindspore.int32 - ) - output_without_positions = model(input_ids) - output_with_positions = model(input_ids, question_positions=question_positions) - self.assertTrue( - ( - output_without_positions.start_logits - == output_with_positions.start_logits - ).all() - ) - self.assertTrue( - ( - output_without_positions.end_logits == output_with_positions.end_logits - ).all() - ) diff --git a/tests/transformers/models/squeezebert/__init__.py b/tests/transformers/models/squeezebert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/squeezebert/test_modeling_squeezebert.py b/tests/transformers/models/squeezebert/test_modeling_squeezebert.py deleted file mode 100644 index 3a792d7d1..000000000 --- a/tests/transformers/models/squeezebert/test_modeling_squeezebert.py +++ /dev/null @@ -1,383 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np -from mindspore import ops - -from mindnlp.transformers import SqueezeBertConfig -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - require_sentencepiece, - require_tokenizers, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import ( - SqueezeBertForMaskedLM, - SqueezeBertForMultipleChoice, - SqueezeBertForQuestionAnswering, - SqueezeBertForSequenceClassification, - SqueezeBertForTokenClassification, - SqueezeBertModel, - ) - - -class SqueezeBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=64, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - q_groups=2, - k_groups=2, - v_groups=2, - post_attention_groups=2, - intermediate_groups=4, - output_groups=1, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - self.q_groups = q_groups - self.k_groups = k_groups - self.v_groups = v_groups - self.post_attention_groups = post_attention_groups - self.intermediate_groups = intermediate_groups - self.output_groups = output_groups - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size - ) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels - ) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) - - def get_config(self): - return SqueezeBertConfig( - embedding_size=self.hidden_size, - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - attention_probs_dropout_prob=self.hidden_dropout_prob, - 
attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - q_groups=self.q_groups, - k_groups=self.k_groups, - v_groups=self.v_groups, - post_attention_groups=self.post_attention_groups, - intermediate_groups=self.intermediate_groups, - output_groups=self.output_groups, - ) - - def create_and_check_squeezebert_model( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = SqueezeBertModel(config=config) - model.set_train(False) - result = model(input_ids, input_mask) - result = model(input_ids) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size), - ) - - def create_and_check_squeezebert_for_masked_lm( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = SqueezeBertForMaskedLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size) - ) - - def create_and_check_squeezebert_for_question_answering( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - model = SqueezeBertForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual( - result.start_logits.shape, (self.batch_size, self.seq_length) - ) - self.parent.assertEqual( - result.end_logits.shape, (self.batch_size, self.seq_length) - ) - - def create_and_check_squeezebert_for_sequence_classification( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = SqueezeBertForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_squeezebert_for_token_classification( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_labels = self.num_labels - model = SqueezeBertForTokenClassification(config=config) - model.set_train(False) - - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length, self.num_labels) - ) - - def create_and_check_squeezebert_for_multiple_choice( - self, - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ): - config.num_choices = self.num_choices - model = SqueezeBertForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = ( - input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - multiple_choice_input_mask = ( - input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - ) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - labels=choice_labels, - ) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_choices) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - sequence_labels, - token_labels, - 
choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class SqueezeBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - SqueezeBertForMaskedLM, - SqueezeBertForMultipleChoice, - SqueezeBertForQuestionAnswering, - SqueezeBertForSequenceClassification, - SqueezeBertForTokenClassification, - ) - if is_mindspore_available() - else None - ) - pipeline_model_mapping = ( - { - "feature-extraction": SqueezeBertModel, - "fill-mask": SqueezeBertForMaskedLM, - "question-answering": SqueezeBertForQuestionAnswering, - "text-classification": SqueezeBertForSequenceClassification, - "token-classification": SqueezeBertForTokenClassification, - "zero-shot": SqueezeBertForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_resize_embeddings = True - test_head_masking = False - - def setUp(self): - self.model_tester = SqueezeBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=SqueezeBertConfig, dim=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_squeezebert_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_squeezebert_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_squeezebert_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_squeezebert_for_question_answering( - *config_and_inputs - ) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_squeezebert_for_sequence_classification( - *config_and_inputs - ) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_squeezebert_for_token_classification( - *config_and_inputs - ) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_squeezebert_for_multiple_choice( - *config_and_inputs - ) - - @slow - def test_model_from_pretrained(self): - model_name = "squeezebert/squeezebert-uncased" - model = SqueezeBertModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - -@require_sentencepiece -@require_tokenizers -@require_mindspore -class SqueezeBertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_classification_head(self): - model = SqueezeBertForSequenceClassification.from_pretrained( - "squeezebert/squeezebert-mnli", from_pt=True - ) - - input_ids = mindspore.tensor( - [[1, 29414, 232, 328, 740, 1140, 12695, 69, 13, 1588, 2]] - ) - output = model(input_ids)[0] - expected_shape = (1, 3) - self.assertEqual(output.shape, expected_shape) - expected_tensor = np.array([[0.6401, -0.0349, -0.6041]]) - self.assertTrue(np.allclose(output.asnumpy(), expected_tensor, atol=1e-4)) diff --git a/tests/transformers/models/stablelm/__init__.py b/tests/transformers/models/stablelm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/stablelm/test_modeling_stablelm.py b/tests/transformers/models/stablelm/test_modeling_stablelm.py deleted file mode 100644 index 
57e7ff3c0..000000000 --- a/tests/transformers/models/stablelm/test_modeling_stablelm.py +++ /dev/null @@ -1,548 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch StableLm model.""" - -import unittest -import pytest - -from parameterized import parameterized -import numpy as np -from mindnlp.transformers import StableLmConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.engine import set_seed -from mindnlp.utils.testing_utils import ( - is_flaky, - require_mindspore, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - AutoTokenizer, - StableLmForCausalLM, - StableLmForSequenceClassification, - StableLmForTokenClassification, - StableLmModel, - ) - from mindnlp.transformers.models.stablelm.modeling_stablelm import ( - StableLmDynamicNTKScalingRotaryEmbedding, - StableLmLinearScalingRotaryEmbedding, - StableLmRotaryEmbedding, - ) - - -# Copied from transformers.tests.models.persimmon.test_modeling_persimmon.PersimmonModelTester with Persimmon -> StableLm -class StableLmModelTester: - # Ignore copy - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - num_key_value_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = 
ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return StableLmConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = StableLmModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = StableLmModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = StableLmForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = StableLmForCausalLM(config=config) - model.set_train(False) - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from transformers.tests.persimmon.test_modeling_persimmon.PersimmonModelTest with Persimmon -> StableLm -class StableLmModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (StableLmModel, StableLmForCausalLM, StableLmForSequenceClassification, StableLmForTokenClassification) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "feature-extraction": StableLmModel, - "text-classification": StableLmForSequenceClassification, - "token-classification": StableLmForTokenClassification, - # TODO (ydshieh): check why these two fail. Fix them or skip them in a better way. 
- # "text-generation": StableLmForCausalLM, - # "zero-shot": StableLmForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - all_generative_model_classes = (StableLmForCausalLM,) if is_mindspore_available() else () - test_headmasking = False - test_pruning = False - - def setUp(self): - self.model_tester = StableLmModelTester(self) - self.config_tester = ConfigTester(self, config_class=StableLmConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_stablelm_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = StableLmForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_stablelm_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = StableLmForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_stablelm_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = StableLmForSequenceClassification(config) - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->StableLm,llama->stablelm - def test_stablelm_token_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = StableLmForTokenClassification(config=config) - model - model.set_train(False) - result = model(input_ids, attention_mask=attention_mask, labels=token_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), - ) - - @parameterized.expand([("linear",), ("dynamic",)]) - # Copied from 
tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->StableLm - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = StableLmModel(config) - original_model.set_train(False) - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = StableLmModel(config) - scaled_model.set_train(False) - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - self.assertTrue(np.allclose(original_short_output.asnumpy(), scaled_short_output.asnumpy(), atol=1e-5)) - else: - self.assertFalse(np.allclose(original_short_output.asnumpy(), scaled_short_output.asnumpy(), atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(np.allclose(original_long_output.asnumpy(), scaled_long_output.asnumpy(), atol=1e-5)) - - # Copied from tests.models.falcon.test_modeling_falcon.FalconModelTest.test_model_rope_scaling with Falcon->StableLm - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = ops.randn(1, dtype=mindspore.float32) # used exlusively to get the dtype and the device - - # Sanity check original RoPE - original_rope = StableLmRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ) - original_cos_short, original_sin_short = original_rope(x, short_input_length) - original_cos_long, original_sin_long = original_rope(x, long_input_length) - self.assertTrue(np.allclose(original_cos_short.asnumpy(), original_cos_long[:short_input_length, :].asnumpy())) - self.assertTrue(np.allclose(original_sin_short.asnumpy(), original_sin_long[:short_input_length, :].asnumpy())) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - linear_scaling_rope = StableLmLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, short_input_length) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, long_input_length) - self.assertTrue(np.allclose(linear_cos_short.asnumpy(), linear_cos_long[:short_input_length, :].asnumpy())) - self.assertTrue(np.allclose(linear_sin_short.asnumpy(), linear_sin_long[:short_input_length, :].asnumpy())) - - for new_position in range(0, long_input_length, scaling_factor): - 
original_position = int(new_position // scaling_factor) - self.assertTrue(np.allclose(linear_cos_long[new_position, :].asnumpy(), original_cos_long[original_position, :].asnumpy())) - self.assertTrue(np.allclose(linear_sin_long[new_position, :].asnumpy(), original_sin_long[original_position, :].asnumpy())) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = StableLmDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, short_input_length) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, long_input_length) - self.assertTrue(np.allclose(ntk_cos_short.asnumpy(), original_cos_short.asnumpy())) - self.assertTrue(np.allclose(ntk_sin_short.asnumpy(), original_sin_short.asnumpy())) - with self.assertRaises(AssertionError): - self.assertTrue(np.allclose(ntk_cos_long.asnumpy(), original_cos_long.asnumpy())) - with self.assertRaises(AssertionError): - self.assertTrue(np.allclose(ntk_sin_long.asnumpy(), original_sin_long.asnumpy())) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - -@require_mindspore -class StableLmModelIntegrationTest(unittest.TestCase): - @pytest.mark.skip - @slow - def test_model_stablelm_3b_4e1t_logits(self): - input_ids = {"input_ids": mindspore.tensor([[510, 8588, 310, 1900, 9386]], dtype=mindspore.int64)} - - model = StableLmForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t") - model.set_train(False) - - output = model(**input_ids).logits - - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[2.7146, 2.4245, 1.5616, 1.4424, 2.6790]]) - self.assertTrue(np.allclose(output.mean(axis=-1).asnumpy(), EXPECTED_MEAN.asnumpy(), atol=1e-4, rtol=1e-4)) - - # Expected logits sliced from [0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([7.1030, -1.4195, 9.9206, 7.7008, 4.9891, 4.2169, 5.5426, 3.7878, 6.7593, 5.7360, 8.4691, 5.5448, 5.0544, 10.4129, 8.5573, 13.0405, 7.3265, 3.5868, 6.1106, 5.9406, 5.6376, 5.7490, 5.4850, 4.8124, 5.1991, 4.6419, 4.5719, 9.9588, 6.7222, 4.5070]) # fmt: skip - self.assertTrue(np.allclose(output[0, 0, :30].asnumpy(), EXPECTED_SLICE.asnumpy(), atol=1e-4, rtol=1e-4)) - - @slow - def test_model_stablelm_3b_4e1t_generation(self): - tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") - model = StableLmForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t") - input_ids = tokenizer.encode( - "My favorite food has always been pizza, but lately", - return_tensors="ms", - ) - - outputs = model.generate(input_ids, max_new_tokens=20, temperature=0) - text = tokenizer.decode(outputs[0], skip_special_tokens=True) - - EXPECTED_TEXT_COMPLETION = """My favorite food has always been pizza, but lately I’ve been craving something different. 
I’ve been trying to eat healthier and I’ve""" - self.assertEqual(text, EXPECTED_TEXT_COMPLETION) - - @pytest.mark.skip - @slow - def test_model_tiny_random_stablelm_2_logits(self): - # Check parallel residual and qk layernorm forward pass - input_ids = {"input_ids": mindspore.tensor([[510, 8588, 310, 1900, 9386]], dtype=mindspore.int64)} - - model = StableLmForCausalLM.from_pretrained("stabilityai/tiny-random-stablelm-2") - model.set_train(False) - - output = model(**input_ids).logits - - # Expected mean on dim = -1 - EXPECTED_MEAN = mindspore.tensor([[-2.7196, -3.6099, -2.6877, -3.1973, -3.9344]]) - self.assertTrue(np.allclose(output.mean(axis=-1).asnumpy(), EXPECTED_MEAN.asnumpy(), atol=1e-4, rtol=1e-4)) - - # Expected logits sliced from [0, 0, 0:30] - EXPECTED_SLICE = mindspore.tensor([2.8364, 5.3811, 5.1659, 7.5485, 4.3219, 6.3315, 1.3967, 6.9147, 3.9679, 6.4786, 5.9176, 3.3067, 5.2917, 0.1485, 3.9630, 7.9947,10.6727, 9.6757, 8.8772, 8.3527, 7.8445, 6.6025, 5.5786, 7.0985,6.1369, 3.4259, 1.9397, 4.6157, 4.8105, 3.1768]) # fmt: skip - self.assertTrue(np.allclose(output[0, 0, :30].asnumpy(), EXPECTED_SLICE.asnumpy(), atol=1e-4, rtol=1e-4)) - - @slow - def test_model_tiny_random_stablelm_2_generation(self): - # Check parallel residual and qk layernorm generation - tokenizer = AutoTokenizer.from_pretrained("stabilityai/tiny-random-stablelm-2") - model = StableLmForCausalLM.from_pretrained("stabilityai/tiny-random-stablelm-2") - input_ids = tokenizer.encode( - "My favorite ride at the amusement park", - return_tensors="ms", - ) - - outputs = model.generate(input_ids, max_new_tokens=20, temperature=0) - text = tokenizer.decode(outputs[0], skip_special_tokens=True) - - EXPECTED_TEXT_COMPLETION = """My favorite ride at the amusement park is the 2000-mile roller coaster. It's a thrilling ride filled with roller coast""" - self.assertEqual(text, EXPECTED_TEXT_COMPLETION) - - # @require_bitsandbytes - # @slow - # @require_flash_attn - # def test_model_3b_long_prompt(self): - # EXPECTED_OUTPUT_TOKEN_IDS = [3, 3, 3] - # input_ids = [306, 338] * 2047 - # model = StableLmForCausalLM.from_pretrained( - # "stabilityai/stablelm-3b-4e1t", - # device_map="auto", - # torch_dtype="auto", - # load_in_4bit=True, - # attn_implementation="flash_attention_2", - # ) - # input_ids = mindspore.tensor([input_ids]).to(model.model.embed_tokens.weight.device) - # generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) - # self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-3:].tolist()) diff --git a/tests/transformers/models/starcoder2/__init__.py b/tests/transformers/models/starcoder2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/starcoder2/test_modeling_starcoder2.py b/tests/transformers/models/starcoder2/test_modeling_starcoder2.py deleted file mode 100644 index 0860c5648..000000000 --- a/tests/transformers/models/starcoder2/test_modeling_starcoder2.py +++ /dev/null @@ -1,474 +0,0 @@ -# coding=utf-8 -# Copyright 2024 BigCode and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch Starcoder2 model.""" - -import tempfile -import unittest - -import pytest - -from mindnlp.transformers import Starcoder2Config -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - require_mindspore_gpu, - slow, -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - AutoTokenizer, - Starcoder2ForCausalLM, - Starcoder2ForSequenceClassification, - Starcoder2ForTokenClassification, - Starcoder2Model, - ) - - -# Copied from transformers.tests.models.mistral.test_modeling_mistral.Starcoder2ModelTester with Mistral->Starcoder2 -class Starcoder2ModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - num_key_value_heads=2, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.pad_token_id = pad_token_id - self.scope = scope - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ops.tril(ops.ones(self.batch_size, self.seq_length)) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - # Ignore copy - def get_config(self): - return 
Starcoder2Config( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - pad_token_id=self.pad_token_id, - eos_token_id=self.pad_token_id, - bos_token_id=self.pad_token_id, - ) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Starcoder2 - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = Starcoder2Model(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Starcoder2 - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = Starcoder2Model(config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Starcoder2 - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = Starcoder2ForCausalLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Starcoder2 - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = Starcoder2ForCausalLM(config=config) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 
3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from transformers.tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Starcoder2 -class Starcoder2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (Starcoder2Model, Starcoder2ForCausalLM, Starcoder2ForSequenceClassification, Starcoder2ForTokenClassification) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (Starcoder2ForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": Starcoder2Model, - "text-classification": Starcoder2ForSequenceClassification, - "token-classification": Starcoder2ForTokenClassification, - "text-generation": Starcoder2ForCausalLM, - "zero-shot": Starcoder2ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - test_headmasking = False - test_pruning = False - - # TODO (ydshieh): Check this. 
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - return True - - def setUp(self): - self.model_tester = Starcoder2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Starcoder2Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_Starcoder2_sequence_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - print(config) - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Starcoder2ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Starcoder2_sequence_classification_model_for_single_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "single_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = Starcoder2ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - def test_Starcoder2_sequence_classification_model_for_multi_label(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - config.problem_type = "multi_label_classification" - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - sequence_labels = ids_tensor( - [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size - ).to(mindspore.float32) - model = Starcoder2ForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) - self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Starcoder2,llama->Starcoder2 - def test_Starcoder2_token_classification_model(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.num_labels = 3 - input_ids = input_dict["input_ids"] - attention_mask = input_ids.ne(1) - token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = Starcoder2ForTokenClassification(config=config) - model.eval() - result = 
model(input_ids, attention_mask=attention_mask, labels=token_labels) - self.assertEqual( - result.logits.shape, - (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), - ) - - @unittest.skip(reason="Starcoder2 buffers include complex numbers, which breaks this test") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="Starcoder2 uses GQA on all models so the KV cache is a non standard format") - def test_past_key_values_format(self): - pass - - -@slow -@require_mindspore_gpu -class Starcoder2IntegrationTest(unittest.TestCase): - def test_starcoder2_batched_generation_sdpa(self): - EXPECTED_TEXT = [ - "Hello my name is Younes and I am a student at the University of Liverpool. I am currently studying for my MSc in Computer Science. I am interested in the field of Machine Learning and I am currently working on", - "def hello_world():\n\treturn 'Hello World!'\n\n@app.route('/hello/')\ndef hello_name(name):\n\treturn 'Hello %s!' % name\n\n@app", - ] - model_id = "bigcode/starcoder2-7b" - - model = Starcoder2ForCausalLM.from_pretrained( - model_id, torch_dtype=mindspore.float16, device_map="auto", attn_implementation="sdpa" - ) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token - - text = ["Hello my name is Younes and", "def hello_world():"] - inputs = tokenizer(text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=40, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT, output_text) - - def test_starcoder2_batched_generation_eager(self): - EXPECTED_TEXT = [ - "Hello my name is Younes and I am a student at the University of Liverpool. I am currently studying for my MSc in Computer Science. I am interested in the field of Machine Learning and I am currently working on", - "def hello_world():\n\treturn 'Hello World!'\n\n@app.route('/hello/')\ndef hello_name(name):\n\treturn 'Hello %s!' % name\n\n@app", - ] - model_id = "bigcode/starcoder2-7b" - - model = Starcoder2ForCausalLM.from_pretrained( - model_id, torch_dtype=mindspore.float16, device_map="auto", attn_implementation="eager" - ) - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokenizer.pad_token = tokenizer.eos_token - - text = ["Hello my name is Younes and", "def hello_world():"] - inputs = tokenizer(text, return_tensors="ms", padding=True) - - output = model.generate(**inputs, max_new_tokens=40, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT, output_text) - - # @require_flash_attn - # @pytest.mark.flash_attn_test - # def test_starcoder2_batched_generation_fa2(self): - # EXPECTED_TEXT = [ - # "Hello my name is Younes and I am a student at the University of Liverpool. I am currently studying for my MSc in Computer Science. I am interested in the field of Machine Learning and I am currently working on", - # "def hello_world():\n\treturn 'Hello World!'\n\n@app.route('/hello/')\ndef hello_name(name):\n\treturn 'Hello %s!' 
% name\n\n@app", - # ] - # model_id = "bigcode/starcoder2-7b" - - # model = Starcoder2ForCausalLM.from_pretrained( - # model_id, torch_dtype=mindspore.float16, device_map="auto", attn_implementation="flash_attention_2" - # ) - # tokenizer = AutoTokenizer.from_pretrained(model_id) - # tokenizer.pad_token = tokenizer.eos_token - - # text = ["Hello my name is Younes and", "def hello_world():"] - # inputs = tokenizer(text, return_tensors="ms", padding=True) - - # output = model.generate(**inputs, max_new_tokens=40, do_sample=False) - # output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - # self.assertEqual(EXPECTED_TEXT, output_text) - - # @require_bitsandbytes - # def test_starcoder2_batched_generation_4bit(self): - # EXPECTED_TEXT = [ - # 'Hello my name is Younes and I am a student at the University of Maryland. I am currently working on a project that is related to the topic of "How to make a game". I am currently working on a project', - # 'def hello_world():\n\treturn "Hello World"\n\n@app.route(\'/hello/\')\ndef hello_name(name):\n\treturn "Hello " + name\n\n@app.route', - # ] - # model_id = "bigcode/starcoder2-7b" - - # model = Starcoder2ForCausalLM.from_pretrained(model_id, load_in_4bit=True) - # tokenizer = AutoTokenizer.from_pretrained(model_id) - # tokenizer.pad_token = tokenizer.eos_token - - # text = ["Hello my name is Younes and", "def hello_world():"] - # inputs = tokenizer(text, return_tensors="ms", padding=True) - - # output = model.generate(**inputs, max_new_tokens=40, do_sample=False) - # output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - # self.assertEqual(EXPECTED_TEXT, output_text) \ No newline at end of file diff --git a/tests/transformers/models/superpoint/__init__.py b/tests/transformers/models/superpoint/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/superpoint/test_image_processing_superpoint.py b/tests/transformers/models/superpoint/test_image_processing_superpoint.py deleted file mode 100644 index 6ce2e58f8..000000000 --- a/tests/transformers/models/superpoint/test_image_processing_superpoint.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_vision_available - -from ...test_image_processing_common import ( - ImageProcessingTestMixin, - prepare_image_inputs, -) - - -if is_vision_available(): - from mindnlp.transformers import SuperPointImageProcessor - - -class SuperPointImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - ): - size = size if size is not None else {"height": 480, "width": 640} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class SuperPointImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = SuperPointImageProcessor if is_vision_available() else None - - def setUp(self) -> None: - super().setUp() - self.image_processor_tester = SuperPointImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processing(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 480, "width": 640}) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size={"height": 42, "width": 42} - ) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - @unittest.skip(reason="SuperPointImageProcessor is always supposed to return a grayscaled image") - def test_call_numpy_4_channels(self): - pass - - def test_input_image_properly_converted_to_grayscale(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - image_inputs = self.image_processor_tester.prepare_image_inputs() - pre_processed_images = image_processor.preprocess(image_inputs) - for image in pre_processed_images["pixel_values"]: - self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] 
== image[2, ...])) diff --git a/tests/transformers/models/superpoint/test_modeling_superpoint.py b/tests/transformers/models/superpoint/test_modeling_superpoint.py deleted file mode 100644 index 4ffd5816a..000000000 --- a/tests/transformers/models/superpoint/test_modeling_superpoint.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -import inspect -import unittest -import numpy as np -from typing import List -from mindnlp.core import ops - -from mindnlp.transformers import SuperPointConfig -from mindnlp.utils.testing_utils import slow, require_vision, require_mindspore, is_mindspore_available, is_vision_available -from mindnlp.utils import cached_property -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import no_grad - - from mindnlp.transformers import ( - SuperPointForKeypointDetection, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - -class SuperPointModelTester: - def __init__( - self, - parent, - batch_size=3, - image_width=80, - image_height=60, - encoder_hidden_sizes: List[int] = [32, 32, 64, 64], - decoder_hidden_size: int = 128, - keypoint_decoder_dim: int = 65, - descriptor_decoder_dim: int = 128, - keypoint_threshold: float = 0.005, - max_keypoints: int = -1, - nms_radius: int = 4, - border_removal_distance: int = 4, - ): - self.parent = parent - self.batch_size = batch_size - self.image_width = image_width - self.image_height = image_height - - self.encoder_hidden_sizes = encoder_hidden_sizes - self.decoder_hidden_size = decoder_hidden_size - self.keypoint_decoder_dim = keypoint_decoder_dim - self.descriptor_decoder_dim = descriptor_decoder_dim - self.keypoint_threshold = keypoint_threshold - self.max_keypoints = max_keypoints - self.nms_radius = nms_radius - self.border_removal_distance = border_removal_distance - - def prepare_config_and_inputs(self): - # SuperPoint expects a grayscale image as input - pixel_values = floats_tensor([self.batch_size, 3, self.image_height, self.image_width]) - config = self.get_config() - return config, pixel_values - - def get_config(self): - return SuperPointConfig( - encoder_hidden_sizes=self.encoder_hidden_sizes, - decoder_hidden_size=self.decoder_hidden_size, - keypoint_decoder_dim=self.keypoint_decoder_dim, - descriptor_decoder_dim=self.descriptor_decoder_dim, - keypoint_threshold=self.keypoint_threshold, - max_keypoints=self.max_keypoints, - nms_radius=self.nms_radius, - border_removal_distance=self.border_removal_distance, - ) - - def create_and_check_keypoint_detection(self, config, pixel_values): - model = SuperPointForKeypointDetection(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.keypoints.shape[0], 
self.batch_size) - self.parent.assertEqual(result.keypoints.shape[-1], 2) - - result = model(pixel_values, output_hidden_states=True) - self.parent.assertEqual( - result.hidden_states[-1].shape, - ( - self.batch_size, - self.encoder_hidden_sizes[-1], - self.image_height // 8, - self.image_width // 8, - ), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class SuperPointModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SuperPointForKeypointDetection,) if is_mindspore_available() else () - all_generative_model_classes = () if is_mindspore_available() else () - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - from_pretrained_id = "magic-leap-community/superpoint" - - def setUp(self): - self.model_tester = SuperPointModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=SuperPointConfig, - has_text_modality=False, - hidden_size=37, - common_properties=["encoder_hidden_sizes", "decoder_hidden_size"], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="SuperPointForKeypointDetection does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="NotImplemented") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="SuperPointForKeypointDetection does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="SuperPointForKeypointDetection does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") - def test_training(self): - pass - - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="SuperPoint does not output any loss term in the forward pass") - def test_retain_grad_hidden_states_attentions(self): - pass - - def test_keypoint_detection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_keypoint_detection(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - # SuperPoint's feature maps are of shape (batch_size, 
num_channels, width, height) - for i, conv_layer_size in enumerate(self.model_tester.encoder_hidden_sizes[:-1]): - self.assertListEqual( - list(hidden_states[i].shape[-3:]), - [ - conv_layer_size, - self.model_tester.image_height // (2 ** (i + 1)), - self.model_tester.image_width // (2 ** (i + 1)), - ], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @slow - def test_model_from_pretrained(self): - model = SuperPointForKeypointDetection.from_pretrained(self.from_pretrained_id,from_pt = True) - self.assertIsNotNone(model) - - def test_forward_labels_should_be_none(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - - with no_grad(): - model_inputs = self._prepare_for_class(inputs_dict, model_class) - # Provide an arbitrary sized Tensor as labels to model inputs - model_inputs["labels"] = ops.rand((128, 128)) - - with self.assertRaises(ValueError) as cm: - model(**model_inputs) - self.assertEqual(ValueError, cm.exception.__class__) - - -def prepare_imgs(): - image1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image2 = Image.open("./tests/fixtures/tests_samples/COCO/000000004016.png") - return [image1,image2] - - -@require_mindspore -@require_vision -class SuperPointModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("magic-leap-community/superpoint") if is_vision_available() else None - - @slow - def test_inference(self): - model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint") - preprocessor = self.default_image_processor - images = prepare_imgs() - inputs = preprocessor(images=images, return_tensors="ms") #return_tensors="ms" to return_tensors="ms" - with no_grad(): - outputs = model(**inputs) - expected_number_keypoints_image0 = 567 - expected_number_keypoints_image1 = 830 - expected_max_number_keypoints = max(expected_number_keypoints_image0, expected_number_keypoints_image1) - expected_keypoints_shape = (len(images), expected_max_number_keypoints,2) - expected_scores_shape =( - len(images), - expected_max_number_keypoints, - ) - - expected_descriptors_shape = (len(images), expected_max_number_keypoints, 256) - # Check output shapes - self.assertEqual(outputs.keypoints.shape, expected_keypoints_shape) - self.assertEqual(outputs.scores.shape, expected_scores_shape) - self.assertEqual(outputs.descriptors.shape, expected_descriptors_shape) - expected_keypoints_image0_values = mindspore.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]]) - expected_scores_image0_values = mindspore.tensor( - [0.0064, 0.0137, 0.0589, 0.0723, 0.5166, 0.0174, 0.1515, 0.2054, 0.0334] - ) - expected_descriptors_image0_value = mindspore.tensor(-0.1096) - predicted_keypoints_image0_values = outputs.keypoints[0, :3] - predicted_scores_image0_values = outputs.scores[0, :9] - predicted_descriptors_image0_value = outputs.descriptors[0, 0, 0] - # Check output values - self.assertTrue( - np.allclose( - 
predicted_keypoints_image0_values.asnumpy(), - expected_keypoints_image0_values.asnumpy(), - atol=1e-3, - ) - ) - self.assertTrue(np.allclose(predicted_scores_image0_values.asnumpy(), expected_scores_image0_values.asnumpy(), atol=1e-3)) - self.assertTrue( - np.allclose( - predicted_descriptors_image0_value.asnumpy(), - expected_descriptors_image0_value.asnumpy(), - atol=1e-3, - ) - ) - # Check mask values - self.assertTrue(outputs.mask[0, expected_number_keypoints_image0].item() == 1) - self.assertTrue(outputs.mask[0, expected_number_keypoints_image0+1].item() == 0) - self.assertTrue(ops.all(outputs.mask[0, : expected_number_keypoints_image0 - 1])) - self.assertTrue(ops.all(ops.logical_not(outputs.mask[0, expected_number_keypoints_image0+1:]))) - self.assertTrue(ops.all(outputs.mask[1])) diff --git a/tests/transformers/models/swiftformer/__init__.py b/tests/transformers/models/swiftformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/swiftformer/test_modeling_swiftformer.py b/tests/transformers/models/swiftformer/test_modeling_swiftformer.py deleted file mode 100644 index 4e0979266..000000000 --- a/tests/transformers/models/swiftformer/test_modeling_swiftformer.py +++ /dev/null @@ -1,284 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore SwiftFormer model.""" - -import copy -import unittest - -from mindnlp.transformers import PretrainedConfig, SwiftFormerConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import SwiftFormerForImageClassification, SwiftFormerModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ViTImageProcessor - - -class SwiftFormerModelTester: - def __init__( - self, - parent, - batch_size=13, - num_channels=3, - is_training=True, - use_labels=True, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - image_size=224, - num_labels=3, - layer_depths=[1, 1, 1, 1], - embed_dims=[16, 16, 32, 32], - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_labels = num_labels - self.image_size = image_size - self.layer_depths = layer_depths - self.embed_dims = embed_dims - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return SwiftFormerConfig( - depths=self.layer_depths, - embed_dims=self.embed_dims, - mlp_ratio=4, - downsamples=[True, True, True, True], - hidden_act="gelu", - num_labels=self.num_labels, - down_patch_size=3, - down_stride=2, - down_pad=1, - drop_rate=0.0, - drop_path_rate=0.0, - use_layer_scale=True, - layer_scale_init_value=1e-5, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = SwiftFormerModel(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.embed_dims[-1], 7, 7)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.num_labels - model = SwiftFormerForImageClassification(config) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - model = SwiftFormerForImageClassification(config) - model.eval() - - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - (config, pixel_values, labels) = self.prepare_config_and_inputs() - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class SwiftFormerModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as SwiftFormer does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (SwiftFormerModel, SwiftFormerForImageClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": SwiftFormerModel, "image-classification": SwiftFormerForImageClassification} - if is_mindspore_available() - else {} - ) - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - has_attentions = False - - def setUp(self): - self.model_tester = SwiftFormerModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=SwiftFormerConfig, - has_text_modality=False, - hidden_size=37, - num_attention_heads=12, - num_hidden_layers=12, - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="SwiftFormer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "MBZUAI/swiftformer-xs" - model = SwiftFormerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip(reason="SwiftFormer does not output attentions") - def test_attention_outputs(self): - pass - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_stages = 8 - self.assertEqual(len(hidden_states), expected_num_stages) # TODO - - # SwiftFormer's feature maps are of shape (batch_size, embed_dims, height, width) - # with the width and height being successively divided by 2, after every 2 blocks - for i in range(len(hidden_states)): - self.assertEqual( - hidden_states[i].shape, - ( - self.model_tester.batch_size, - self.model_tester.embed_dims[i // 2], - (self.model_tester.image_size // 4) // 2 ** (i // 2), - (self.model_tester.image_size // 4) // 2 ** (i // 2), - ), - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_initialization(self): - def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): - no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) - setattr(configs_no_init, key, no_init_subconfig) - return 
configs_no_init - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.mean() * 1e9) / 1e9).round().item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class SwiftFormerModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ViTImageProcessor.from_pretrained("MBZUAI/swiftformer-xs") if is_vision_available() else None - - @slow - def test_inference_image_classification_head(self): - model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([[-2.1703e00, 2.1107e00, -2.0811e00]]) - self.assertTrue(ops.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/swin/__init__.py b/tests/transformers/models/swin/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/swin/test_modeling_swin.py b/tests/transformers/models/swin/test_modeling_swin.py deleted file mode 100644 index c2c2faf03..000000000 --- a/tests/transformers/models/swin/test_modeling_swin.py +++ /dev/null @@ -1,513 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Swin model.""" - -import collections -import unittest - -from mindnlp.transformers import SwinConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import SwinBackbone, SwinForImageClassification, SwinForMaskedImageModeling, SwinModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class SwinModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - patch_size=2, - num_channels=3, - embed_dim=16, - depths=[1, 2, 1], - num_heads=[2, 2, 4], - window_size=2, - mlp_ratio=2.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - patch_norm=True, - initializer_range=0.02, - layer_norm_eps=1e-5, - is_training=True, - scope=None, - use_labels=True, - type_sequence_label_size=10, - encoder_stride=8, - out_features=["stage1", "stage2"], - out_indices=[1, 2], - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.embed_dim = embed_dim - self.depths = depths - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.patch_norm = patch_norm - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.is_training = is_training - self.scope = scope - self.use_labels = use_labels - self.type_sequence_label_size = type_sequence_label_size - self.encoder_stride = encoder_stride - self.out_features = out_features - self.out_indices = out_indices - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return SwinConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - embed_dim=self.embed_dim, - depths=self.depths, - num_heads=self.num_heads, - window_size=self.window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=self.qkv_bias, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - drop_path_rate=self.drop_path_rate, - hidden_act=self.hidden_act, - use_absolute_embeddings=self.use_absolute_embeddings, - path_norm=self.patch_norm, - layer_norm_eps=self.layer_norm_eps, - initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, - out_features=self.out_features, - out_indices=self.out_indices, - ) - - def 
create_and_check_model(self, config, pixel_values, labels): - model = SwinModel(config=config) - model.eval() - result = model(pixel_values) - - expected_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) - expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1)) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, expected_dim)) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = SwinBackbone(config=config) - model.eval() - result = model(pixel_values) - - # verify hidden states - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], 16, 16]) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - - # verify backbone works with out_features=None - config.out_features = None - model = SwinBackbone(config=config) - model.eval() - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual(list(result.feature_maps[0].shape), [self.batch_size, model.channels[-1], 4, 4]) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - - def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): - model = SwinForMaskedImageModeling(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) - ) - - # test greyscale images - config.num_channels = 1 - model = SwinForMaskedImageModeling(config) - model.eval() - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, 1, self.image_size, self.image_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = SwinForImageClassification(config) - model.eval() - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = SwinForImageClassification(config) - model.eval() - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class SwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - SwinModel, - SwinBackbone, - SwinForImageClassification, - SwinForMaskedImageModeling, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": SwinModel, "image-classification": SwinForImageClassification} - if is_mindspore_available() - else {} - ) - fx_compatible = True - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = SwinModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=SwinConfig, - 
embed_dim=37, - has_text_modality=False, - common_properties=["image_size", "patch_size", "num_channels"], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - # TODO: check if this works again for PyTorch 2.x.y - @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") - def test_multi_gpu_data_parallel_forward(self): - pass - - def test_training_gradient_checkpointing(self): - super().test_training_gradient_checkpointing() - - @unittest.skipIf(mindspore.get_context('device_target') == 'CPU', 'cpu has error') - def test_training(self): - super().test_training() - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_for_masked_image_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @unittest.skip(reason="Swin does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Swin Transformer does not use feedforward chunking") - def test_feed_forward_chunking(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - expected_num_attentions = len(self.model_tester.depths) - self.assertEqual(len(attentions), expected_num_attentions) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - window_size_squared = config.window_size**2 - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), expected_num_attentions) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_heads[0], window_size_squared, window_size_squared], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # also another +1 for reshaped_hidden_states - added_hidden_states = 1 if model_class.__name__ == "SwinBackbone" else 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - 
self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), expected_num_attentions) - - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_heads[0], window_size_squared, window_size_squared], - ) - - def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # Swin has a different seq_length - patch_size = ( - config.patch_size - if isinstance(config.patch_size, collections.abc.Iterable) - else (config.patch_size, config.patch_size) - ) - - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - if not model_class.__name__ == "SwinBackbone": - reshaped_hidden_states = outputs.reshaped_hidden_states - self.assertEqual(len(reshaped_hidden_states), expected_num_layers) - - batch_size, num_channels, height, width = reshaped_hidden_states[0].shape - reshaped_hidden_states = ( - reshaped_hidden_states[0].view(batch_size, num_channels, height * width).permute(0, 2, 1) - ) - self.assertListEqual( - list(reshaped_hidden_states.shape[-2:]), - [num_patches, self.model_tester.embed_dim], - ) - - def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - image_size = ( - self.model_tester.image_size - if isinstance(self.model_tester.image_size, collections.abc.Iterable) - else (self.model_tester.image_size, self.model_tester.image_size) - ) - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - self.check_hidden_states_output(inputs_dict, config, model_class, image_size) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - self.check_hidden_states_output(inputs_dict, config, model_class, image_size) - - def test_hidden_states_output_with_padding(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.patch_size = 3 - - image_size = ( - self.model_tester.image_size - if isinstance(self.model_tester.image_size, collections.abc.Iterable) - else (self.model_tester.image_size, self.model_tester.image_size) - ) - patch_size = ( - config.patch_size - if isinstance(config.patch_size, collections.abc.Iterable) - else (config.patch_size, config.patch_size) - ) - - padded_height = image_size[0] + patch_size[0] - (image_size[0] % patch_size[0]) - padded_width = image_size[1] + patch_size[1] - (image_size[1] % patch_size[1]) - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - self.check_hidden_states_output(inputs_dict, config, model_class, (padded_height, padded_width)) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/swin-tiny-patch4-window7-224" - model = SwinModel.from_pretrained(model_name) - 
self.assertIsNotNone(model) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "embeddings" not in name and param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - -@require_vision -@require_mindspore -class SwinModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") - if is_vision_available() - else None - ) - - @slow - def test_inference_image_classification_head(self): - model = SwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224") - image_processor = self.default_image_processor - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = mindspore.tensor([-0.0948, -0.6454, -0.0921]) - self.assertTrue(ops.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) - - @slow - def test_inference_interpolate_pos_encoding(self): - # Swin models have an `interpolate_pos_encoding` argument in their forward method, - # allowing to interpolate the pre-trained position embeddings in order to use - # the model on higher resolutions. - model = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224") - - image_processor = self.default_image_processor - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = image_processor(images=image, size={"height": 481, "width": 481}, return_tensors="ms") - pixel_values = inputs.pixel_values - - # forward pass - with no_grad(): - outputs = model(pixel_values, interpolate_pos_encoding=True) - - # verify the logits - expected_shape = (1, 256, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - -@require_mindspore -class SwinBackboneTest(unittest.TestCase, BackboneTesterMixin): - all_model_classes = (SwinBackbone,) if is_mindspore_available() else () - config_class = SwinConfig - - def setUp(self): - self.model_tester = SwinModelTester(self) \ No newline at end of file diff --git a/tests/transformers/models/swin2sr/__init__.py b/tests/transformers/models/swin2sr/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/swin2sr/test_image_processing_swin2sr.py b/tests/transformers/models/swin2sr/test_image_processing_swin2sr.py deleted file mode 100644 index 18f7b8d3b..000000000 --- a/tests/transformers/models/swin2sr/test_image_processing_swin2sr.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import Swin2SRImageProcessor - from mindnlp.transformers.image_transforms import get_image_size - - -class Swin2SRImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_rescale=True, - rescale_factor=1 / 255, - do_pad=True, - pad_size=8, - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_pad = do_pad - self.pad_size = pad_size - - def prepare_image_processor_dict(self): - return { - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_pad": self.do_pad, - "pad_size": self.pad_size, - } - - def expected_output_image_shape(self, images): - img = images[0] - - if isinstance(img, Image.Image): - input_width, input_height = img.size - else: - input_height, input_width = img.shape[-2:] - - pad_height = (input_height // self.pad_size + 1) * self.pad_size - input_height - pad_width = (input_width // self.pad_size + 1) * self.pad_size - input_width - - return self.num_channels, input_height + pad_height, input_width + pad_width - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class Swin2SRImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = Swin2SRImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = Swin2SRImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processor, "do_rescale")) - self.assertTrue(hasattr(image_processor, "rescale_factor")) - self.assertTrue(hasattr(image_processor, "do_pad")) - self.assertTrue(hasattr(image_processor, "pad_size")) - - def calculate_expected_size(self, image): - old_height, old_width = get_image_size(image) - size = self.image_processor_tester.pad_size - - pad_height = (old_height // size + 1) * size - old_height - pad_width = (old_width // size + 1) * size - old_width - return old_height + pad_height, old_width + pad_width - - # 
Swin2SRImageProcessor does not support batched input - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Swin2SRImageProcessor does not support batched input - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Swin2SRImageProcessor does not support batched input - def test_call_numpy_4_channels(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - self.image_processor_tester.num_channels = 4 - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing( - image_inputs[0], return_tensors="ms", input_data_format="channels_first" - ).pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - self.image_processor_tester.num_channels = 3 - - # Swin2SRImageProcessor does not support batched input - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random MindSpore tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - - for image in image_inputs: - self.assertIsInstance(image, mindspore.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) \ No newline at end of file diff --git a/tests/transformers/models/swin2sr/test_modeling_swin2sr.py b/tests/transformers/models/swin2sr/test_modeling_swin2sr.py deleted file mode 100644 index fb49780fc..000000000 --- a/tests/transformers/models/swin2sr/test_modeling_swin2sr.py +++ /dev/null @@ -1,350 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch Swin2SR model.""" - -import unittest - -from mindnlp.transformers import Swin2SRConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import Swin2SRForImageSuperResolution, Swin2SRModel - -if is_vision_available(): - from PIL import Image - from mindnlp.transformers import Swin2SRImageProcessor - - -class Swin2SRModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - patch_size=1, - num_channels=3, - num_channels_out=1, - embed_dim=16, - depths=[1, 2, 1], - num_heads=[2, 2, 4], - window_size=2, - mlp_ratio=2.0, - qkv_bias=True, - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - drop_path_rate=0.1, - hidden_act="gelu", - use_absolute_embeddings=False, - patch_norm=True, - initializer_range=0.02, - layer_norm_eps=1e-5, - is_training=True, - scope=None, - use_labels=False, - upscale=2, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_channels_out = num_channels_out - self.embed_dim = embed_dim - self.depths = depths - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - self.qkv_bias = qkv_bias - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.drop_path_rate = drop_path_rate - self.hidden_act = hidden_act - self.use_absolute_embeddings = use_absolute_embeddings - self.patch_norm = patch_norm - self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.is_training = is_training - self.scope = scope - self.use_labels = use_labels - self.upscale = upscale - - # here we set some attributes to make tests pass - self.num_hidden_layers = len(depths) - self.hidden_size = embed_dim - self.seq_length = (image_size // patch_size) ** 2 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return Swin2SRConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - num_channels_out=self.num_channels_out, - embed_dim=self.embed_dim, - depths=self.depths, - num_heads=self.num_heads, - window_size=self.window_size, - mlp_ratio=self.mlp_ratio, - qkv_bias=self.qkv_bias, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - 
drop_path_rate=self.drop_path_rate, - hidden_act=self.hidden_act, - use_absolute_embeddings=self.use_absolute_embeddings, - path_norm=self.patch_norm, - layer_norm_eps=self.layer_norm_eps, - initializer_range=self.initializer_range, - upscale=self.upscale, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = Swin2SRModel(config=config) - model.eval() - result = model(pixel_values) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.embed_dim, self.image_size, self.image_size) - ) - - def create_and_check_for_image_super_resolution(self, config, pixel_values, labels): - model = Swin2SRForImageSuperResolution(config) - model.eval() - result = model(pixel_values) - - expected_image_size = self.image_size * self.upscale - self.parent.assertEqual( - result.reconstruction.shape, - (self.batch_size, self.num_channels_out, expected_image_size, expected_image_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class Swin2SRModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Swin2SRModel, Swin2SRForImageSuperResolution) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": Swin2SRModel, "image-to-image": Swin2SRForImageSuperResolution} - if is_mindspore_available() - else {} - ) - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Swin2SRModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=Swin2SRConfig, - embed_dim=37, - has_text_modality=False, - common_properties=["image_size", "patch_size", "num_channels"], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_for_image_super_resolution(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_super_resolution(*config_and_inputs) - - # TODO: check if this works again for PyTorch 2.x.y - @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") - def test_multi_gpu_data_parallel_forward(self): - pass - - @unittest.skip(reason="Swin2SR does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Swin2SR does not support training yet") - def test_training(self): - pass - - @unittest.skip(reason="Swin2SR does not support training yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) 
- self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - @slow - def test_model_from_pretrained(self): - model_name = "caidas/swin2SR-classical-sr-x2-64" - model = Swin2SRModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - # overwriting because of `logit_scale` parameter - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if "logit_scale" in name: - continue - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - expected_num_attentions = len(self.model_tester.depths) - self.assertEqual(len(attentions), expected_num_attentions) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - window_size_squared = config.window_size**2 - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), expected_num_attentions) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_heads[0], window_size_squared, window_size_squared], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), expected_num_attentions) - - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_heads[0], window_size_squared, window_size_squared], - ) - - -@require_vision -@require_mindspore -@slow -class Swin2SRModelIntegrationTest(unittest.TestCase): - def test_inference_image_super_resolution_head(self): - processor = Swin2SRImageProcessor() - model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-classical-sr-x2-64") - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor(images=image, return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 3, 976, 1296) - self.assertEqual(outputs.reconstruction.shape, expected_shape) - expected_slice = mindspore.tensor( - [[0.5458, 0.5546, 0.5638], [0.5526, 0.5565, 0.5651], [0.5396, 0.5426, 0.5621]] - ) - self.assertTrue(ops.allclose(outputs.reconstruction[0, 0, :3, :3], expected_slice, atol=1e-4)) - - def test_inference_fp16(self): - 
processor = Swin2SRImageProcessor() - model = Swin2SRForImageSuperResolution.from_pretrained( - "caidas/swin2SR-classical-sr-x2-64", torch_dtype=mindspore.float16 - ) - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor(images=image, return_tensors="ms").to(model.dtype) - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 3, 976, 1296) - self.assertEqual(outputs.reconstruction.shape, expected_shape) - expected_slice = mindspore.tensor( - [[0.5454, 0.5542, 0.5640], [0.5518, 0.5562, 0.5649], [0.5391, 0.5425, 0.5620]], dtype=model.dtype - ) - self.assertTrue(ops.allclose(outputs.reconstruction[0, 0, :3, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/switch_transformers/__init__.py b/tests/transformers/models/switch_transformers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/switch_transformers/test_modeling_switch_transformers.py b/tests/transformers/models/switch_transformers/test_modeling_switch_transformers.py deleted file mode 100644 index 0d3f8ee57..000000000 --- a/tests/transformers/models/switch_transformers/test_modeling_switch_transformers.py +++ /dev/null @@ -1,1089 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Google SwitchTransformers Authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import copy -import tempfile -import unittest - -from mindnlp.transformers import SwitchTransformersConfig -from mindnlp.utils.testing_utils import ( - require_tokenizers, - require_mindspore, - slow, - is_mindspore_available -) - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn - from mindnlp.core.nn import Parameter - - from mindnlp.transformers import ( - AutoTokenizer, - SwitchTransformersEncoderModel, - SwitchTransformersForConditionalGeneration, - SwitchTransformersModel, - SwitchTransformersTop1Router, - ) - from mindnlp.transformers.models.switch_transformers.modeling_switch_transformers import ( - load_balancing_loss_func, - router_z_loss_func, - ) - - -class SwitchTransformersModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=9, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - decoder_start_token_id=0, - decoder_layers=None, - sparse_step=1, - num_sparse_decoder_layers=2, - num_sparse_encoder_layers=2, - expert_capacity=100, - router_jitter_noise=0.0, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.scope = None - self.decoder_layers = decoder_layers - self.sparse_step = sparse_step - self.num_sparse_decoder_layers = num_sparse_decoder_layers - self.num_sparse_encoder_layers = num_sparse_encoder_layers - self.expert_capacity = expert_capacity - self.router_jitter_noise = router_jitter_noise - - def get_large_model_config(self): - return SwitchTransformersConfig.from_pretrained("google/switch-base-8") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = self.get_config() - - return ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def 
get_pipeline_config(self): - return SwitchTransformersConfig( - vocab_size=166, # switch_transformers forces 100 extra tokens - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - expert_capacity=self.expert_capacity, - router_jitter_noise=self.router_jitter_noise, - ) - - def get_config(self): - return SwitchTransformersConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - sparse_step=self.sparse_step, - num_sparse_encoder_layers=self.num_sparse_encoder_layers, - num_sparse_decoder_layers=self.num_sparse_decoder_layers, - ) - - def check_prepare_lm_labels_via_shift_left( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersModel(config=config) - model.eval() - - # make sure that lm_labels are correctly padded from the right - lm_labels = lm_labels.masked_fill((lm_labels == self.decoder_start_token_id), self.eos_token_id) - - # add casaul pad token mask - triangular_mask = ops.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() - lm_labels = lm_labels.masked_fill(triangular_mask, self.pad_token_id) - decoder_input_ids = model._shift_right(lm_labels) - - for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): - # first item - self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) - if i < decoder_input_ids_slice.shape[-1]: - if i < decoder_input_ids.shape[-1] - 1: - # items before diagonal - self.parent.assertListEqual( - decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() - ) - # pad items after diagonal - if i < decoder_input_ids.shape[-1] - 2: - self.parent.assertListEqual( - decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() - ) - else: - # all items after square - self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) - - def create_and_check_model( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - decoder_output = result.last_hidden_state - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, 
self.encoder_seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_with_lm_head( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 10) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersModel(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, use_cache=True, output_router_logits=False) - outputs_use_cache_conf = model(input_ids, output_router_logits=False) - outputs_no_past = model(input_ids, use_cache=False, output_router_logits=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids, output_router_logits=False)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values, output_router_logits=False)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersModel(config=config).get_decoder() - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past_key_values = model( - input_ids, attention_mask=attn_mask, use_cache=True, output_router_logits=False - ).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = 
random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask, output_router_logits=False)[ - "last_hidden_state" - ] - output_from_past = model( - next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, output_router_logits=False - )["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersModel(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True, output_router_logits=False) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask, output_router_logits=False)[ - "last_hidden_state" - ] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - past_key_values=past_key_values, - output_router_logits=False, - )["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - @slow - def create_and_check_generate_with_past_key_values( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - r""" - This test does not pass for small models due to precision errors. It is therefore only run for slightly larger models. 
- """ - model = ( - SwitchTransformersForConditionalGeneration.from_pretrained("google/switch-base-8").eval() - ) - mindspore.manual_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False - ) - mindspore.manual_seed(0) - output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = SwitchTransformersModel(config=config).half().eval() - output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_encoder_decoder_shared_weights( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - for model_class in [SwitchTransformersModel, SwitchTransformersForConditionalGeneration]: - mindspore.manual_seed(0) - mindspore.set_seed(0) - model = model_class(config=config).eval() - # load state dict copies weights but does not tie them - model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) - - mindspore.manual_seed(0) - mindspore.set_seed(0) - tied_config = copy.deepcopy(config) - tied_config.tie_encoder_decoder = True - tied_model = model_class(config=tied_config).eval() - - model_result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - # check that outputs are equal - print(model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx]) - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 - ) - ) - - # check that outputs after saving and loading are equal - with tempfile.TemporaryDirectory() as tmpdirname: - tied_model.save_pretrained(tmpdirname) - tied_model = model_class.from_pretrained(tmpdirname) - tied_model.eval() - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], - tied_model_result[0][0, :, random_slice_idx], - atol=1e-4, - ) - ) - - def check_resize_embeddings_switch_transformers_v1_1( - self, - config, - ): - prev_vocab_size = config.vocab_size - - config.tie_word_embeddings = False - model = SwitchTransformersForConditionalGeneration(config=config).eval() - model.resize_token_embeddings(prev_vocab_size - 10) - - 
self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "use_cache": False, - "output_router_logits": False, - } - return config, inputs_dict - - -@require_mindspore -class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (SwitchTransformersModel, SwitchTransformersForConditionalGeneration) if is_mindspore_available() else () - ) - all_generative_model_classes = (SwitchTransformersForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": SwitchTransformersModel, - "summarization": SwitchTransformersForConditionalGeneration, - "text2text-generation": SwitchTransformersForConditionalGeneration, - "translation": SwitchTransformersForConditionalGeneration, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_pruning = False - test_resize_embeddings = True - test_model_parallel = False - is_encoder_decoder = True - test_torchscript = False - # The small SWITCH_TRANSFORMERS model needs higher percentages for CPU/MP tests - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = SwitchTransformersModelTester(self) - self.config_tester = ConfigTester(self, config_class=SwitchTransformersConfig, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_shift_right(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_v1_1(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - # check that gated gelu feed forward and different word embeddings work - config = config_and_inputs[0] - config.tie_word_embeddings = False - config.feed_forward_proj = "gated-gelu" - self.model_tester.create_and_check_model(config, *config_and_inputs[1:]) - - def test_config_and_model_silu_gated(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - config.feed_forward_proj = "gated-silu" - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_with_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_lm_head(*config_and_inputs) - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_past_with_attn_mask(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_decoder_model_past_with_3d_attn_mask(self): - 
( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = self.model_tester.prepare_config_and_inputs() - - attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length], - vocab_size=2, - ) - decoder_attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length], - vocab_size=2, - ) - - self.model_tester.create_and_check_decoder_model_attention_mask_past( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_generate_with_past_key_values(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs) - - def test_encoder_decoder_shared_weights(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_v1_1_resize_embeddings(self): - config = self.model_tester.prepare_config_and_inputs()[0] - self.model_tester.check_resize_embeddings_switch_transformers_v1_1(config) - - @slow - def test_model_from_pretrained(self): - model_name = "google/switch-base-8" - model = SwitchTransformersModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_generate_with_head_masking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - max_length = config_and_inputs[1].shape[-1] + 3 - model = SwitchTransformersForConditionalGeneration(config).eval() - - head_masking = { - "head_mask": ops.zeros(config.num_layers, config.num_heads), - "decoder_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - "cross_attn_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - } - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - head_masks = {name: mask} - # Explicitly pass decoder_head_mask as it is required from SWITCH_TRANSFORMERS model when head_mask specified - if name == "head_mask": - head_masks["decoder_head_mask"] = ops.ones( - config.num_decoder_layers, config.num_heads - ) - - out = model.generate( - config_and_inputs[1], - num_beams=1, - max_length=max_length, - output_attentions=True, - return_dict_in_generate=True, - **head_masks, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) - - @unittest.skip( - reason="This architecure has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - -class SwitchTransformersEncoderOnlyModelTester: - def __init__( - self, - parent, - vocab_size=99, - 
batch_size=13, - encoder_seq_length=7, - # For common tests - use_attention_mask=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - is_training=False, - dropout_rate=0.1, - initializer_factor=0.002, - is_encoder_decoder=False, - eos_token_id=1, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - # For common tests - self.seq_length = self.encoder_seq_length - self.use_attention_mask = use_attention_mask - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.is_encoder_decoder = is_encoder_decoder - self.scope = None - self.is_training = is_training - - def get_large_model_config(self): - return SwitchTransformersConfig.from_pretrained("google/switch-base-8") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - config = SwitchTransformersConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return config, input_ids, attention_mask - - def create_and_check_model(self, config, input_ids, attention_mask): - model = SwitchTransformersEncoderModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - attention_mask=attention_mask, - ) - result = model(input_ids=input_ids) - encoder_output = result.last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - - def create_and_check_model_fp16_forward(self, config, input_ids, attention_mask): - model = SwitchTransformersEncoderModel(config=config).half().eval() - output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -class SwitchTransformersEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (SwitchTransformersEncoderModel,) if is_mindspore_available() else () - test_pruning = False - test_resize_embeddings = False - test_model_parallel = False - test_torchscript = False - - def setUp(self): - self.model_tester = SwitchTransformersEncoderOnlyModelTester(self) - self.config_tester = ConfigTester(self, config_class=SwitchTransformersConfig, d_model=37) - - def test_config(self): - 
self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - @unittest.skip( - reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" - ) - def test_load_save_without_tied_weights(self): - pass - - -def use_task_specific_params(model, task): - model.config.update(model.config.task_specific_params[task]) - - -@require_mindspore -class TestAsymmetricSwitchTransformers(unittest.TestCase): - def build_model_and_check_forward_pass(self, **kwargs): - tester = SwitchTransformersModelTester(self, **kwargs) - config, *inputs = tester.prepare_config_and_inputs() - ( - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = inputs - model = SwitchTransformersForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - output_router_logits=False, - ) - # outputs = model(*inputs) - assert len(outputs) == 4 - assert outputs["logits"].shape == (tester.batch_size, tester.decoder_seq_length, tester.vocab_size) - assert outputs["loss"].shape == () - return model - - def test_small_decoder(self): - # num_hidden_layers is passed to SwitchTransformersConfig as num_layers - model = self.build_model_and_check_forward_pass(decoder_layers=1, num_hidden_layers=2) - assert len(model.encoder.block) == 2 - assert len(model.decoder.block) == 1 - - def test_defaulting_to_symmetry(self): - # num_hidden_layers is passed to SwitchTransformersConfig as num_layers - model = self.build_model_and_check_forward_pass(num_hidden_layers=2) - assert len(model.decoder.block) == len(model.encoder.block) == 2 - - -@require_mindspore -class SwitchTransformerRouterTest(unittest.TestCase): - r""" - Switch Transformers has different blocks from classic transformer based models. - The Switch MLP contains a Router class that has to be tested to check if it is correctly implemented - - Original implementation of the routers here: - - """ - - config = SwitchTransformersConfig( - num_experts=2, - hidden_size=8, - d_ff=16, - router_jitter_noise=0, - expert_capacity=4, - ) - - def test_equivalency_balancy_loss(self): - r""" - This test checks if the balancing loss is correctly implemented - as in the original implementation of the Switch Transformer. - """ - router_probs = mindspore.Tensor( - [ - [0.35490513, 0.60419905], - [0.4275843, 0.23061597], - [0.32985854, 0.43953657], - [0.25099766, 0.27730572], - [0.7678207, 0.71474564], - ] - ) - - expert_indices = mindspore.Tensor([[0], [1], [1], [0], [0]]).to(mindspore.int32) - - loss = load_balancing_loss_func(router_probs, expert_indices) - self.assertAlmostEqual(loss.item(), 0.8741045, places=5) - - def test_equivalency_router_z_loss(self): - r""" - This test checks if the router z loss is correctly implemented - as in the original implementation of the Switch Transformer.
- """ - logits = mindspore.Tensor( - [ - [ - [-4.2124424, 3.891939, -3.6481273, 1.8849981], - [0.32625437, 2.918651, 0.84758997, -4.556842], - [-3.32062, 4.6977115, -0.15439987, 0.44086337], - [3.4467149, 4.3436565, -4.7224274, -4.264637], - [-2.224406, -2.5318158, -1.3832569, 1.1891162], - [-2.320062, -0.44705987, 4.289819, -0.00662684], - ], - [ - [0.99470854, -0.6992364, 0.25503993, 4.2952085], - [3.5937333, -3.2408535, -4.298278, 4.426601], - [0.7669008, 2.6588762, 2.4505413, 4.6051874], - [0.23330331, -3.0845237, 0.6262374, -2.9865491], - [0.7595146, -2.1099675, -4.155346, -2.8326452], - [2.3771453, 1.004138, -3.1781673, 0.7581556], - ], - ] - ) - - loss = router_z_loss_func(logits) - self.assertAlmostEqual(loss.item(), 13.786719, places=5) - - def test_equivalency_token_chose_masked_router(self): - r""" - This test tests the equivalency between the `SwitchTransformersTop1Router` - originally implemented from here: TODO: provide link - """ - - input_tokens = mindspore.Tensor( - [ - [ - [0.6433916, 0.18188512, 0.02240455, 0.563781], - [0.5526401, 0.0958724, 0.34253013, 0.03644359], - [0.08744538, 0.7909105, 0.35205448, 0.53364205], - ], - [ - [0.02900076, 0.4168595, 0.5802449, 0.91486526], - [0.27414513, 0.14991808, 0.9383501, 0.5209162], - [0.51207185, 0.90618336, 0.7309413, 0.95533276], - ], - ] - ) - - model = SwitchTransformersTop1Router(self.config) - - model.classifier.weight = Parameter( - mindspore.Tensor( - [ - [0.02008116, 0.00620062], - [-0.00811031, -0.00031623], - [-0.03542127, 0.02703803], - [0.02335377, -0.02971946], - ], - ).t() - ) - - expert_index, _, router_logits = model(input_tokens) - router_probs = ops.softmax(router_logits, dim=-1) - - router_z_loss = router_z_loss_func(router_logits) - auxiliary_loss = load_balancing_loss_func(router_probs, ops.argmax(expert_index, dim=-1)) - - self.assertAlmostEqual(auxiliary_loss.item(), 1.000308, places=5) - self.assertAlmostEqual(router_z_loss.item(), 0.4789799, places=5) - - # self.assertTrue(ops.allclose(expert_index.bool().unsqueeze(-1), expected_dispatch_mask)) - - def test_max_routing_capacity(self): - model = SwitchTransformersTop1Router(self.config) - seq_len = 128 - batch_size = 4 - hidden_states = ops.stack(batch_size * [ops.rand((seq_len, self.config.hidden_size))]) - - router_probs, router_logits = model._compute_router_probabilities(hidden_states) - expert_index = ops.argmax(router_probs, dim=-1) - expert_index = nn.functional.one_hot(expert_index, num_classes=self.config.num_experts) - - token_priority = ops.cumsum(expert_index, dim=-2) - expert_capacity_mask = token_priority <= self.config.expert_capacity - expert_index = expert_index * expert_capacity_mask - - assert ops.sum(expert_index) <= batch_size * self.config.num_experts * self.config.expert_capacity - - -@slow -@require_mindspore -@require_tokenizers -class SwitchTransformerModelIntegrationTests(unittest.TestCase): - def test_small_logits(self): - r""" - Logits testing to check implementation consistency between `t5x` implementation - and `transformers` implementation of Switch-C transformers. We only check the logits - of the first batch. 
- """ - model = SwitchTransformersModel.from_pretrained("google/switch-base-8", ms_dtype=mindspore.bfloat16) - input_ids = ops.ones((32, 64), dtype=mindspore.int64) - decoder_input_ids = ops.ones((32, 64), dtype=mindspore.int64) - - # fmt: off - EXPECTED_MEAN_LOGITS = mindspore.Tensor( - [ - -0.204102, -0.193359, 0.523438, -0.296875, 0.108887, - 0.0211182, 0.605469, -0.100586, -0.0551758, 0.296875, - 0.0090332, 0.174805, 0.139648, -0.170898, -0.0981445, - 0.0245361, 0.0373535, 0.050293, -0.212891, 0.129883, - 0.390625, -0.203125, -0.122559, -0.180664, 0.0437012, - -0.349609, -0.0250244, -0.104004, -0.15918, -0.133789 - ] - ).to(mindspore.bfloat16) - # fmt: on - hf_logits = model(input_ids, decoder_input_ids=decoder_input_ids).last_hidden_state.cpu() - hf_logits = hf_logits[0, 0, :30] - - assert ops.allclose(hf_logits, EXPECTED_MEAN_LOGITS, rtol=6e-3, atol=9e-3) - - @unittest.skip( - "Unless we stop stripping left and right by default for all special tokens, the expected ids obtained here will not match the original ones. Wait for https://github.com/huggingface/transformers/pull/23909 to be merged" - ) - def test_small_generate(self): - # Generate test using the smallest switch-C model. - - model = SwitchTransformersForConditionalGeneration.from_pretrained( - "google/switch-base-8", ms_dtype=mindspore.bfloat16 - ).eval() - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", use_fast=False, legacy=False) - - input_ids = tokenizer( - "The human walks into a bar and orders a <extra_id_0>", return_tensors="ms" - ).input_ids - sequences = model.generate(input_ids) - output_str = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0] - self.assertEqual(output_str, "drink.") - - input_ids = tokenizer( - "A <extra_id_0> walks into a bar and orders a <extra_id_1> with <extra_id_2> pinch of <extra_id_3>.", - return_tensors="ms", - ).input_ids - sequences = model.generate(input_ids) - output_str = tokenizer.batch_decode(sequences, skip_special_tokens=False)[0] - - EXPECTED_OUTPUT = "<pad><extra_id_0> man<extra_id_1> beer<extra_id_2> a<extra_id_3> whiskey<extra_id_4>.</s>" - self.assertEqual(output_str, EXPECTED_OUTPUT) - - @unittest.skip( - "Unless we stop stripping left and right by default for all special tokens, the expected ids obtained here will not match the original ones. Wait for https://github.com/huggingface/transformers/pull/23909 to be merged" - ) - def test_small_batch_generate(self): - BATCH_SIZE = 4 - model = SwitchTransformersForConditionalGeneration.from_pretrained( - "google/switch-base-8", ms_dtype=mindspore.bfloat16 - ).eval() - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", use_fast=False, legacy=False) - - inputs = [ - "A <extra_id_0> walks into a bar and orders a <extra_id_1> with <extra_id_2> pinch of <extra_id_3>." - ] * BATCH_SIZE - encoded_input = tokenizer.batch_encode_plus(inputs, return_tensors="ms") - - sequences = model.generate(**encoded_input) - batch_output = tokenizer.batch_decode(sequences, skip_special_tokens=False) - - for i in range(0, BATCH_SIZE, 2): - self.assertEqual(batch_output[i], batch_output[i + 1]) \ No newline at end of file diff --git a/tests/transformers/models/t5/__init__.py b/tests/transformers/models/t5/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/t5/test_modeling_t5.py b/tests/transformers/models/t5/test_modeling_t5.py deleted file mode 100644 index f342ea513..000000000 --- a/tests/transformers/models/t5/test_modeling_t5.py +++ /dev/null @@ -1,1517 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import copy -import os -import pickle -import tempfile -import unittest - -from mindnlp.transformers import T5Config, is_mindspore_available -from mindnlp.transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property -from mindnlp.configs import SUPPORT_BF16 - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad - from mindnlp.engine import set_seed - - from mindnlp.transformers import ( - AutoTokenizer, - ByT5Tokenizer, - T5EncoderModel, - T5ForConditionalGeneration, - T5ForQuestionAnswering, - T5ForSequenceClassification, - T5ForTokenClassification, - T5Model, - T5Tokenizer, - ) - - -class T5ModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=7, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - decoder_start_token_id=0, - scope=None, - decoder_layers=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.scope = None - self.decoder_layers = decoder_layers - - def get_large_model_config(self): - return T5Config.from_pretrained("google-t5/t5-base") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size).clamp(2) - input_ids[:, -1] = self.eos_token_id # Eos Token - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - 
decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = self.get_config() - - return ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def get_pipeline_config(self): - return T5Config( - vocab_size=166, # t5 forces 100 extra tokens - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def get_config(self): - return T5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def check_prepare_lm_labels_via_shift_left( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5Model(config=config) - model.eval() - - # make sure that lm_labels are correctly padded from the right - lm_labels = lm_labels.masked_fill((lm_labels == self.decoder_start_token_id), self.eos_token_id) - - # add casaul pad token mask - triangular_mask = ops.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() - lm_labels = lm_labels.masked_fill(triangular_mask, self.pad_token_id) - decoder_input_ids = model._shift_right(lm_labels) - - for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): - # first item - self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) - if i < decoder_input_ids_slice.shape[-1]: - if i < decoder_input_ids.shape[-1] - 1: - # items before diagonal - self.parent.assertListEqual( - decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() - ) - # pad items after diagonal - if i < decoder_input_ids.shape[-1] - 2: - self.parent.assertListEqual( - decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() - ) - else: - # all items after square - self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) - - def create_and_check_model( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5Model(config=config) - model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - decoder_output = result.last_hidden_state - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - 
self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_with_lm_head( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5ForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_with_sequence_classification_head( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - labels = mindspore.tensor([1] * self.batch_size, dtype=mindspore.int64) - model = T5ForSequenceClassification(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=input_ids, - labels=labels, - ) - # self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, config.num_labels)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5Model(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_attention_mask_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5Model(config=config).get_decoder() - model.eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - output, past_key_values = model(input_ids, 
attention_mask=attn_mask, use_cache=True).to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5Model(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_mask], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_generate_with_past_key_values( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = T5ForConditionalGeneration(config=config).eval() - set_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], num_beams=2, max_length=5, do_sample=True, use_cache=False - ) - set_seed(0) - output_with_past_cache = model.generate(input_ids[:1], num_beams=2, max_length=5, do_sample=True) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = 
T5Model(config=config).half().eval() - output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_encoder_decoder_shared_weights( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - for model_class in [T5Model, T5ForConditionalGeneration]: - set_seed(0) - model = model_class(config=config).eval() - # load state dict copies weights but does not tie them - model.encoder.load_state_dict(model.decoder.state_dict(), strict=False) - - set_seed(0) - tied_config = copy.deepcopy(config) - tied_config.tie_encoder_decoder = True - tied_model = model_class(config=tied_config).eval() - - model_result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4 - ) - ) - - # check that outputs after saving and loading are equal - with tempfile.TemporaryDirectory() as tmpdirname: - tied_model.save_pretrained(tmpdirname) - tied_model = model_class.from_pretrained(tmpdirname) - tied_model.eval() - - # check that models has less parameters - self.parent.assertLess( - sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()) - ) - random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item() - - tied_model_result = tied_model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - - # check that outputs are equal - self.parent.assertTrue( - ops.allclose( - model_result[0][0, :, random_slice_idx], - tied_model_result[0][0, :, random_slice_idx], - atol=1e-4, - ) - ) - - def check_resize_embeddings_t5_v1_1( - self, - config, - ): - prev_vocab_size = config.vocab_size - - config.tie_word_embeddings = False - model = T5ForConditionalGeneration(config=config).eval() - model.resize_token_embeddings(prev_vocab_size - 10) - - self.parent.assertEqual(model.get_input_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.get_output_embeddings().weight.shape[0], prev_vocab_size - 10) - self.parent.assertEqual(model.config.vocab_size, prev_vocab_size - 10) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "use_cache": False, - } - return config, inputs_dict - - -@require_mindspore -class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (T5Model, T5ForConditionalGeneration, 
T5ForSequenceClassification, T5ForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (T5ForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": T5Model, - "question-answering": T5ForQuestionAnswering, - "summarization": T5ForConditionalGeneration, - "text-classification": T5ForSequenceClassification, - "text2text-generation": T5ForConditionalGeneration, - "translation": T5ForConditionalGeneration, - "zero-shot": T5ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_mindspore_available() else () - fx_compatible = True - test_pruning = False - test_resize_embeddings = True - test_model_parallel = True - is_encoder_decoder = True - # The small T5 model needs higher percentages for CPU/MP tests - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = T5ModelTester(self) - self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) - - # `QAPipelineTests` is not working well with slow tokenizers (for some models) and we don't want to touch the file - # `src/transformers/data/processors/squad.py` (where this test fails for this model) - def is_pipeline_test_to_skip( - self, pipeline_test_case_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if tokenizer_name is None: - return True - if pipeline_test_case_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - def test_config(self): - self.config_tester.run_common_tests() - - def test_shift_right(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_v1_1(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - # check that gated gelu feed forward and different word embeddings work - config = config_and_inputs[0] - config.tie_word_embeddings = False - config.feed_forward_proj = "gated-gelu" - self.model_tester.create_and_check_model(config, *config_and_inputs[1:]) - - # T5ForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (T5Model, T5ForConditionalGeneration, T5ForQuestionAnswering): - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - def test_config_and_model_silu_gated(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - config.feed_forward_proj = "gated-silu" - 
self.model_tester.create_and_check_model(*config_and_inputs) - - def test_with_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_lm_head(*config_and_inputs) - - def test_with_sequence_classification_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs) - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - def test_decoder_model_past_with_attn_mask(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - - def test_decoder_model_past_with_3d_attn_mask(self): - ( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = self.model_tester.prepare_config_and_inputs() - - attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.encoder_seq_length, self.model_tester.encoder_seq_length], - vocab_size=2, - ) - decoder_attention_mask = ids_tensor( - [self.model_tester.batch_size, self.model_tester.decoder_seq_length, self.model_tester.decoder_seq_length], - vocab_size=2, - ) - - self.model_tester.create_and_check_decoder_model_attention_mask_past( - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_generate_with_past_key_values(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs) - - def test_encoder_decoder_shared_weights(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_encoder_decoder_shared_weights(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_v1_1_resize_embeddings(self): - config = self.model_tester.prepare_config_and_inputs()[0] - self.model_tester.check_resize_embeddings_t5_v1_1(config) - - @slow - def test_model_from_pretrained(self): - model_name = "google-t5/t5-small" - model = T5Model.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") - def test_export_to_onnx(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - model = T5Model(config_and_inputs[0]) - with tempfile.TemporaryDirectory() as tmpdirname: - ops.onnx.export( - model, - (config_and_inputs[1], config_and_inputs[3], config_and_inputs[2]), - f"{tmpdirname}/t5_test.onnx", - export_params=True, - opset_version=9, - input_names=["input_ids", "decoder_input_ids"], - ) - - def test_generate_with_head_masking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - max_length = config_and_inputs[1].shape[-1] + 3 - model = T5ForConditionalGeneration(config).eval() - - head_masking = { - 
"head_mask": ops.zeros(config.num_layers, config.num_heads), - "decoder_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - "cross_attn_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - } - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - head_masks = {name: mask} - # Explicitly pass decoder_head_mask as it is required from T5 model when head_mask specified - if name == "head_mask": - head_masks["decoder_head_mask"] = ops.ones( - config.num_decoder_layers, config.num_heads - ) - - out = model.generate( - config_and_inputs[1], - num_beams=1, - max_length=max_length, - output_attentions=True, - return_dict_in_generate=True, - **head_masks, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) - - -class T5EncoderOnlyModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - # For common tests - use_attention_mask=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - is_training=False, - dropout_rate=0.1, - initializer_factor=0.002, - is_encoder_decoder=False, - eos_token_id=1, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - # For common tests - self.seq_length = self.encoder_seq_length - self.use_attention_mask = use_attention_mask - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.is_encoder_decoder = is_encoder_decoder - self.scope = None - self.is_training = is_training - - def get_large_model_config(self): - return T5Config.from_pretrained("google-t5/t5-base") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - config = T5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - attention_mask, - ): - model = T5EncoderModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - attention_mask=attention_mask, - ) - result = model(input_ids=input_ids) - encoder_output = result.last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - - def create_and_check_model_fp16_forward( - self, - config, - 
input_ids, - attention_mask, - ): - model = T5EncoderModel(config=config).half().eval() - output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_with_token_classification_head( - self, - config, - input_ids, - attention_mask, - ): - labels = mindspore.tensor([1] * self.seq_length * self.batch_size, dtype=mindspore.int64) - model = T5ForTokenClassification(config=config).eval() - outputs = model( - input_ids=input_ids, - labels=labels, - attention_mask=attention_mask, - ) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.seq_length, config.num_labels)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -class T5EncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (T5EncoderModel, T5ForTokenClassification) if is_mindspore_available() else () - test_pruning = False - test_resize_embeddings = False - test_model_parallel = True - pipeline_model_mapping = ( - { - "token-classification": T5ForTokenClassification, - } - if is_mindspore_available() - else {} - ) - all_parallelizable_model_classes = (T5EncoderModel,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = T5EncoderOnlyModelTester(self) - self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_with_token_classification_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs) - - -def use_task_specific_params(model, task): - model.config.update(model.config.task_specific_params[task]) - - -@require_mindspore -@require_tokenizers -@slow -class T5ModelFp16Tests(unittest.TestCase): - def test_fp16_fp32_conversion(self): - r""" - A test to check whether the argument `keep_in_fp32_modules` correctly does its job - """ - orig_import = __import__ - accelerate_mock = unittest.mock.Mock() - - # mock import of accelerate - def import_accelerate_mock(name, *args, **kwargs): - if name == "accelerate": - if accelerate_available: - return accelerate_mock - else: - raise ImportError - return orig_import(name, *args, **kwargs) - - # Load without using `accelerate` - with unittest.mock.patch("builtins.__import__", side_effect=import_accelerate_mock): - accelerate_available = False - - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", ms_dtype=mindspore.float16) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == mindspore.float32) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == mindspore.float16) - - # Load without in bf16 - if SUPPORT_BF16: - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", ms_dtype=mindspore.bfloat16) - 
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == mindspore.bfloat16) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == mindspore.bfloat16) - - if SUPPORT_BF16: - # Load using `accelerate` in bf16 - model = T5ForConditionalGeneration.from_pretrained( - "google-t5/t5-small", ms_dtype=mindspore.bfloat16 - ) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == mindspore.bfloat16) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == mindspore.bfloat16) - - # Load using `accelerate` in bf16 - model = T5ForConditionalGeneration.from_pretrained( - "google-t5/t5-small", ms_dtype=mindspore.bfloat16, low_cpu_mem_usage=True - ) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == mindspore.bfloat16) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == mindspore.bfloat16) - - # Load without using `accelerate` - model = T5ForConditionalGeneration.from_pretrained( - "google-t5/t5-small", ms_dtype=mindspore.float16, low_cpu_mem_usage=True - ) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == mindspore.float32) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == mindspore.float16) - - # Load using `accelerate` - model = T5ForConditionalGeneration.from_pretrained( - "google-t5/t5-small", ms_dtype=mindspore.float16 - ) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == mindspore.float32) - self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wi.weight.dtype == mindspore.float16) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class T5ModelIntegrationTests(unittest.TestCase): - @cached_property - def model(self): - return T5ForConditionalGeneration.from_pretrained("google-t5/t5-base") - - @cached_property - def tokenizer(self): - return T5Tokenizer.from_pretrained("google-t5/t5-base") - - # @slow - # def test_torch_quant(self): - # r""" - # Test that a simple `torch.quantization.quantize_dynamic` call works on a T5 model. - # """ - # model_name = "google/flan-t5-small" - # tokenizer = T5Tokenizer.from_pretrained(model_name) - # model = T5ForConditionalGeneration.from_pretrained(model_name) - # model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) - # input_text = "Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?" 
-    # input_ids = tokenizer(input_text, return_tensors="ms").input_ids
-    # _ = model.generate(input_ids)
-
-    @slow
-    def test_small_generation(self):
-        model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        model.config.max_length = 8
-        model.config.num_beams = 1
-        model.config.do_sample = False
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        input_ids = tokenizer("summarize: Hello there", return_tensors="ms").input_ids
-        sequences = model.generate(input_ids)
-
-        output_str = tokenizer.batch_decode(sequences, skip_special_tokens=True)[0]
-        print(output_str)
-        self.assertTrue(output_str == "Hello there!")
-
-    @slow
-    def test_small_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5 # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_t5_checkpoint = ''
-        >>> path_to_mtf_small_spm_model_path = ''
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
-        tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="ms").input_ids
-        labels = tokenizer("Hi I am", return_tensors="ms").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -19.0845
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_v1_1_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5 # pip install t5==0.7.1
-        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary
-
-        >>> path_to_mtf_small_t5_v1_1_checkpoint = ''
-        >>> path_to_mtf_small_spm_model_path = ''
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_t5_v1_1_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_spm_model_path, extra_ids=100)
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-small")
-        tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="ms").input_ids
-        labels = tokenizer("Hi I am", return_tensors="ms").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -59.0293
-        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
-
-    @slow
-    def test_small_byt5_integration_test(self):
-        """
-        For comparison run:
-        >>> import t5 # pip install t5==0.9.1
-
-        >>> path_to_byt5_small_checkpoint = ''
-        >>> t5_model = t5.models.MtfModel(model_dir=path_to_byt5_small_checkpoint, batch_size=1, tpu=None)
-        >>> vocab = t5.data.ByteVocabulary()
-        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
-        """
-
-        model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-        tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-
-        input_ids = tokenizer("Hello there", return_tensors="ms").input_ids
-        labels = tokenizer("Hi I am", return_tensors="ms").input_ids
-
-        loss = model(input_ids, labels=labels).loss
-        mtf_score = -(labels.shape[-1] * loss.item())
-
-        EXPECTED_SCORE = -60.7397
-        print(mtf_score)
-
self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4) - - @slow - def test_summarization(self): - model = self.model - tok = self.tokenizer - - FRANCE_ARTICLE = ( # @noqa - "Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings" - " Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane." - ' Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation."' - ' He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s' - " comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video" - " showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French" - " Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a" - " phone at the wreckage site. The two publications described the supposed video, but did not post it on" - " their websites. The publications said that they watched the video, which was found by a source close to" - " the investigation. \"One can hear cries of 'My God' in several languages,\" Paris Match reported." - ' "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the' - " cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the" - ' screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt,' - " editor-in-chief of Bild online. An official with France's accident investigation agency, the BEA, said" - " the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman" - " in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the" - ' reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said,' - ' but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be' - " sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by" - " specialized technicians working hand-in-hand with investigators. But none of the cell phones found so" - " far have been sent to the institute, Menichini said. Asked whether staff involved in the search could" - ' have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin' - ' Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match' - ' are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered' - ' cell phones from the crash site after Bild and Paris Match published their reports. "That is something' - " we did not know before. ... Overall we can say many things of the investigation weren't revealed by the" - ' investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline' - " Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the" - " controls of Germanwings Flight 9525, which he's accused of deliberately crashing last week in the" - ' French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of' - ' severe depression," the airline said Tuesday. 
Email correspondence between Lubitz and the school' - " discovered in an internal investigation, Lufthansa said, included medical documents he submitted in" - " connection with resuming his flight training. The announcement indicates that Lufthansa, the parent" - " company of Germanwings, knew of Lubitz's battle with depression, allowed him to continue training and" - " ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100%" - ' fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was' - " sharing the information and documents -- including training and medical records -- with public" - " prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the" - " past week to recover human remains and plane debris scattered across a steep mountainside. He saw the" - " crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash" - " site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late" - " Tuesday that no visible human remains were left at the site but recovery teams would keep searching." - " French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all" - " the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested." - " In the meantime, the recovery of the victims' personal belongings will start Wednesday, Menichini said." - " Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew" - " on board. Check out the latest from our correspondents . The details about Lubitz's correspondence with" - " the flight school during his training were among several developments as investigators continued to" - " delve into what caused the crash and Lubitz's possible motive for downing the jet. A Lufthansa" - " spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his" - ' examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in' - " Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at" - " some point before his aviation career and underwent psychotherapy before he got his pilot's license." - " Kumpa emphasized there's no evidence suggesting Lubitz was suicidal or acting aggressively before the" - " crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to" - " lose his pilot's license, a European government official briefed on the investigation told CNN on" - ' Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being' - " considered. Another source, a law enforcement official briefed on the investigation, also told CNN that" - " authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would" - " not be allowed to fly because of his medical problems. Lubitz's girlfriend told investigators he had" - " seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded" - " he had psychological issues, the European government official said. But no matter what details emerge" - " about his previous mental health struggles, there's more to the story, said Brian Russell, a forensic" - ' psychologist. 
"Psychology can explain why somebody would turn rage inward on themselves about the fact' - " that maybe they weren't going to keep doing their job and they're upset about that and so they're" - ' suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to' - " also take that rage and turn it outward on 149 other people who had nothing to do with the person's" - ' problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight' - " 9525? CNN's Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura" - " Smith-Spark wrote from London. CNN's Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine" - " Amiel and Anna-Maja Rappard contributed to this report." - ) - SHORTER_ARTICLE = ( - "(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on" - " Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The" - " formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based." - " The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its" - ' jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East' - ' Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the' - " situation in Palestinian territories, paving the way for possible war crimes investigations against" - " Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and" - " the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the" - " body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, said it was a" - ' move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the' - ' world is also a step closer to ending a long era of impunity and injustice," he said, according to an' - ' ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge' - " Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the" - ' Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine' - " acquires all the rights as well as responsibilities that come with being a State Party to the Statute." - ' These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights' - ' Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should' - " immediately end their pressure, and countries that support universal acceptance of the court's treaty" - ' should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the' - " group. \"What's objectionable is the attempts to undermine international justice, not Palestine's" - ' decision to join a treaty to which over 100 countries around the world are members." In January, when' - " the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an" - ' outrage, saying the court was overstepping its boundaries. The United States also said it "strongly"' - " disagreed with the court's decision. 
\"As we have said repeatedly, we do not believe that Palestine is a" - ' state and therefore we do not believe that it is eligible to join the ICC," the State Department said in' - ' a statement. It urged the warring sides to resolve their differences through direct negotiations. "We' - ' will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,"' - " it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the" - ' territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the' - " court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou" - ' Bensouda said her office would "conduct its analysis in full independence and impartiality." The war' - " between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry" - " will include alleged war crimes committed since June. The International Criminal Court was set up in" - " 2002 to prosecute genocide, crimes against humanity and war crimes. CNN's Vasco Cotovio, Kareem Khadder" - " and Faith Karimi contributed to this report." - ) - IRAN_ARTICLE = ( - "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran" - " in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively" - " block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger." - " Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli" - " Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a" - " letter to the Iranian leadership warning them away from a deal. The debate that has already begun since" - " the announcement of the new framework will likely result in more heat than light. It will not be helped" - " by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: ." - " The most misleading assertion, despite universal rejection by experts, is that the negotiations'" - " objective at the outset was the total elimination of any nuclear program in Iran. That is the position" - " of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it" - " had been, there would have been no Iranian team at the negotiating table. Rather, the objective has" - " always been to structure an agreement or series of agreements so that Iran could not covertly develop a" - " nuclear arsenal before the United States and its allies could respond. The new framework has exceeded" - " expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by" - " two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another" - " dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite" - " sharp accusations by some in the United States and its allies, Iran denies having such a program, and" - " U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's" - " continued cooperation with International Atomic Energy Agency inspections is further evidence on this" - " point, and we'll know even more about Iran's program in the coming months and years because of the deal." 
- " In fact, the inspections provisions that are part of this agreement are designed to protect against any" - " covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that" - " the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter" - " warning that a deal might be killed by Congress or a future president). This of course is not the case." - " The talks were between Iran and the five permanent members of the U.N. Security Council (United States," - " United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has" - " played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement" - " reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran" - " and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement" - " contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the" - " case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased" - " or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes" - " Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear" - " sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going" - " forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such" - " a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the" - ' agreement should be a formal treaty requiring the Senate to "advise and consent." But the issue is not' - " suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New" - " START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement" - " with Iran will not be so balanced. The restrictions and obligations in the final framework agreement" - " will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove" - " most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally" - " some insist that any agreement must address Iranian missile programs, human rights violations or support" - " for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are" - " unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in" - " the negotiations would be a poison pill. This agreement should be judged on its merits and on how it" - " affects the security of our negotiating partners and allies, including Israel. Those judgments should be" - " fact-based, not based on questionable assertions or dubious assumptions." - ) - ARTICLE_SUBWAY = ( - "New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A" - " year later, she got married again in Westchester County, but to a different man and without divorcing" - " her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos" - ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married' - " once more, this time in the Bronx. In an application for a marriage license, she stated it was her" - ' "first and only" marriage. 
Barrientos, now 39, is facing two criminal counts of "offering a false' - ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage' - " license application, according to court documents. Prosecutors said the marriages were part of an" - " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to" - " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was" - " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New" - " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total," - " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All" - " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be" - " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors" - " said the immigration scam involved some of her husbands, who filed for permanent residence status" - " shortly after the marriages. Any divorces happened only after such filings were approved. It was" - " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District" - " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's" - ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,' - " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his" - " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces" - " up to four years in prison. Her next court appearance is scheduled for May 18." - ) - - expected_summaries = [ - 'prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a' - " cell phone video of the final seconds . \"one can hear cries of 'My God' in several languages,\" one" - " magazine says .", - "the formal accession was marked by a ceremony at The Hague, in the Netherlands . the ICC opened a" - " preliminary examination into the situation in the occupied Palestinian territory . as members of the" - " court, Palestinians may be subject to counter-charges as well .", - "the u.s. and its negotiating partners reached a very strong framework agreement with Iran . aaron miller:" - " the debate that has already begun since the announcement of the new framework will likely result in more" - " heat than light . the deal would reduce Iran's low-enriched uranium stockpile, cut centrifuges and" - " implement a rigorous inspection regime .", - "prosecutors say the marriages were part of an immigration scam . 
if convicted, barrientos faces two" - ' criminal counts of "offering a false instrument for filing in the first degree" she has been married 10' - " times, with nine of her marriages occurring between 1999 and 2002 .", - ] - - use_task_specific_params(model, "summarization") - - dct = tok( - [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], - padding="max_length", - max_length=512, - truncation=True, - return_tensors="ms", - ) - self.assertEqual(512, dct["input_ids"].shape[1]) - - hypotheses_batch = model.generate( - **dct, - num_beams=4, - length_penalty=2.0, - max_length=142, - min_length=56, - no_repeat_ngram_size=3, - do_sample=False, - early_stopping=True, - ) - - decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertListEqual( - expected_summaries, - decoded, - ) - - @slow - def test_translation_en_to_de(self): - model = self.model - tok = self.tokenizer - use_task_specific_params(model, "translation_en_to_de") - - en_text = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.' - expected_translation = ( - '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.' - ) - - input_ids = tok.encode(model.config.prefix + en_text, return_tensors="ms") - input_ids = input_ids - output = model.generate(input_ids) - translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertEqual(translation, expected_translation) - - @slow - def test_translation_en_to_fr(self): - model = self.model # google-t5/t5-base - tok = self.tokenizer - use_task_specific_params(model, "translation_en_to_fr") - - en_text = ( - ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of' - " countless generations of stars: the oldest stars are seen as blue dots. " - ) - - input_ids = tok.encode(model.config.prefix + en_text, return_tensors="ms") - input_ids = input_ids - - output = model.generate( - input_ids=input_ids, - num_beams=4, - length_penalty=2.0, - max_length=100, - no_repeat_ngram_size=3, - do_sample=False, - early_stopping=True, - ) - translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - new_truncated_translation = ( - "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre " - "un " - "« portrait familial » de générations innombrables d’étoiles : les plus anciennes sont observées " - "sous forme " - "de points bleus." - ) - - self.assertEqual(translation, new_truncated_translation) - - @slow - def test_translation_en_to_ro(self): - model = self.model - tok = self.tokenizer - use_task_specific_params(model, "translation_en_to_ro") - en_text = "Taco Bell said it plans to add 2,000 locations in the US by 2022." - expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022." - - inputs = tok(model.config.prefix + en_text, return_tensors="ms") - output = model.generate(**inputs) - translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertEqual(translation, expected_translation) - - @slow - def test_contrastive_search_t5(self): - article = ( - " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. 
A" - " year later, she got married again in Westchester County, but to a different man and without divorcing" - " her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos" - ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married' - " once more, this time in the Bronx. In an application for a marriage license, she stated it was her" - ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false' - ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage' - " license application, according to court documents. Prosecutors said the marriages were part of an" - " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to" - " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was" - " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New" - " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total," - " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All" - " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be" - " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors" - " said the immigration scam involved some of her husbands, who filed for permanent residence status" - " shortly after the marriages. Any divorces happened only after such filings were approved. It was" - " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District" - " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's" - ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,' - " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his" - " native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces" - " up to four years in prison. Her next court appearance is scheduled for May 18." - ) - article = "summarize: " + article.strip() - t5_tokenizer = AutoTokenizer.from_pretrained("flax-community/t5-base-cnn-dm") - t5_model = T5ForConditionalGeneration.from_pretrained("flax-community/t5-base-cnn-dm") - input_ids = t5_tokenizer( - article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="ms" - ).input_ids - - outputs = t5_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64) - generated_text = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True) - - self.assertListEqual( - generated_text, - [ - "Liana Barrientos has been married 10 times, nine of them in the Bronx. Her husbands filed for " - "permanent residence after the marriages, prosecutors say." - ], - ) - - - @slow - def test_translation_inference_time(self): - model = self.model # google-t5/t5-base - tok = self.tokenizer - use_task_specific_params(model, "translation_en_to_fr") - - en_text = ( - ' This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of' - " countless generations of stars: the oldest stars are seen as blue dots. 
" - ) - - input_ids = tok.encode(model.config.prefix + en_text, return_tensors="ms") - input_ids = input_ids - - output = model.generate( - input_ids=input_ids, - max_new_tokens=50, - do_sample=False, - ) - print(output) - - -@require_mindspore -class TestAsymmetricT5(unittest.TestCase): - def build_model_and_check_forward_pass(self, **kwargs): - tester = T5ModelTester(self, **kwargs) - config, *inputs = tester.prepare_config_and_inputs() - ( - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = inputs - model = T5ForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - # outputs = model(*inputs) - assert len(outputs) == 4 - assert outputs["logits"].shape == (tester.batch_size, tester.decoder_seq_length, tester.vocab_size) - assert outputs["loss"].shape == () - return model - - def test_small_decoder(self): - # num_hidden_layers is passed to T5Config as num_layers - model = self.build_model_and_check_forward_pass(decoder_layers=1, num_hidden_layers=2) - assert len(model.encoder.block) == 2 - assert len(model.decoder.block) == 1 - - def test_defaulting_to_symmetry(self): - # num_hidden_layers is passed to T5Config as num_layers - model = self.build_model_and_check_forward_pass(num_hidden_layers=2) - assert len(model.decoder.block) == len(model.encoder.block) == 2 \ No newline at end of file diff --git a/tests/transformers/models/tapas/__init__.py b/tests/transformers/models/tapas/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/tapas/test_modeling_tapas.py b/tests/transformers/models/tapas/test_modeling_tapas.py deleted file mode 100644 index 782a16965..000000000 --- a/tests/transformers/models/tapas/test_modeling_tapas.py +++ /dev/null @@ -1,1084 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import unittest - -import numpy as np -import pandas as pd -from mindnlp.transformers import ( - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_MASKED_LM_MAPPING, - MODEL_FOR_MULTIPLE_CHOICE_MAPPING, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, - TapasConfig, -) -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, slow -from mindnlp.utils import cached_property, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - TapasForMaskedLM, - TapasForQuestionAnswering, - TapasForSequenceClassification, - TapasModel, - TapasTokenizer, - ) - from mindnlp.transformers.models.tapas.modeling_tapas import ( - IndexMap, - ProductIndexMap, - flatten, - gather, - range_index_map, - reduce_max, - reduce_mean, - reduce_sum, - ) - -class TapasModelTester: - """You can also import this e.g from .test_modeling_tapas import TapasModelTester""" - - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - max_position_embeddings=512, - type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10], - type_sequence_label_size=2, - positive_weight=10.0, - num_aggregation_labels=4, - num_labels=2, - aggregation_loss_importance=0.8, - use_answer_as_supervision=True, - answer_loss_importance=0.001, - use_normalized_answer_loss=False, - huber_loss_delta=25.0, - temperature=1.0, - agg_temperature=1.0, - use_gumbel_for_cells=False, - use_gumbel_for_agg=False, - average_approximation_function="ratio", - cell_selection_preference=0.5, - answer_loss_cutoff=100, - max_num_rows=64, - max_num_columns=32, - average_logits_per_cell=True, - select_one_column=True, - allow_empty_column_selection=False, - init_cell_selection_weights_to_zero=True, - reset_position_index_per_cell=True, - disable_per_token_loss=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.max_position_embeddings = max_position_embeddings - self.type_vocab_sizes = type_vocab_sizes - self.type_sequence_label_size = type_sequence_label_size - self.positive_weight = positive_weight - self.num_aggregation_labels = num_aggregation_labels - self.num_labels = num_labels - self.aggregation_loss_importance = 
aggregation_loss_importance - self.use_answer_as_supervision = use_answer_as_supervision - self.answer_loss_importance = answer_loss_importance - self.use_normalized_answer_loss = use_normalized_answer_loss - self.huber_loss_delta = huber_loss_delta - self.temperature = temperature - self.agg_temperature = agg_temperature - self.use_gumbel_for_cells = use_gumbel_for_cells - self.use_gumbel_for_agg = use_gumbel_for_agg - self.average_approximation_function = average_approximation_function - self.cell_selection_preference = cell_selection_preference - self.answer_loss_cutoff = answer_loss_cutoff - self.max_num_rows = max_num_rows - self.max_num_columns = max_num_columns - self.average_logits_per_cell = average_logits_per_cell - self.select_one_column = select_one_column - self.allow_empty_column_selection = allow_empty_column_selection - self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero - self.reset_position_index_per_cell = reset_position_index_per_cell - self.disable_per_token_loss = disable_per_token_loss - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = [] - for type_vocab_size in self.type_vocab_sizes: - token_type_ids.append(ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=type_vocab_size)) - token_type_ids = ops.stack(token_type_ids, dim=2) - - sequence_labels = None - token_labels = None - labels = None - numeric_values = None - numeric_values_scale = None - float_answer = None - aggregation_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - numeric_values = floats_tensor([self.batch_size, self.seq_length]) - numeric_values_scale = floats_tensor([self.batch_size, self.seq_length]) - float_answer = floats_tensor([self.batch_size]) - aggregation_labels = ids_tensor([self.batch_size], self.num_aggregation_labels) - - config = self.get_config() - - return ( - config, - input_ids, - input_mask, - token_type_ids, - sequence_labels, - token_labels, - labels, - numeric_values, - numeric_values_scale, - float_answer, - aggregation_labels, - ) - - def get_config(self): - return TapasConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_sizes=self.type_vocab_sizes, - initializer_range=self.initializer_range, - positive_weight=self.positive_weight, - num_aggregation_labels=self.num_aggregation_labels, - num_labels=self.num_labels, - aggregation_loss_importance=self.aggregation_loss_importance, - use_answer_as_supervision=self.use_answer_as_supervision, - answer_loss_importance=self.answer_loss_importance, - use_normalized_answer_loss=self.use_normalized_answer_loss, - huber_loss_delta=self.huber_loss_delta, - temperature=self.temperature, - agg_temperature=self.agg_temperature, - use_gumbel_for_cells=self.use_gumbel_for_cells, - 
use_gumbel_for_agg=self.use_gumbel_for_agg, - average_approximation_function=self.average_approximation_function, - cell_selection_preference=self.cell_selection_preference, - answer_loss_cutoff=self.answer_loss_cutoff, - max_num_rows=self.max_num_rows, - max_num_columns=self.max_num_columns, - average_logits_per_cell=self.average_logits_per_cell, - select_one_column=self.select_one_column, - allow_empty_column_selection=self.allow_empty_column_selection, - init_cell_selection_weights_to_zero=self.init_cell_selection_weights_to_zero, - reset_position_index_per_cell=self.reset_position_index_per_cell, - disable_per_token_loss=self.disable_per_token_loss, - ) - - def create_and_check_model( - self, - config, - input_ids, - input_mask, - token_type_ids, - sequence_labels, - token_labels, - labels, - numeric_values, - numeric_values_scale, - float_answer, - aggregation_labels, - ): - model = TapasModel(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, - config, - input_ids, - input_mask, - token_type_ids, - sequence_labels, - token_labels, - labels, - numeric_values, - numeric_values_scale, - float_answer, - aggregation_labels, - ): - model = TapasForMaskedLM(config=config) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_question_answering( - self, - config, - input_ids, - input_mask, - token_type_ids, - sequence_labels, - token_labels, - labels, - numeric_values, - numeric_values_scale, - float_answer, - aggregation_labels, - ): - # inference: without aggregation head (SQA). Model only returns logits - sqa_config = copy.copy(config) - sqa_config.num_aggregation_labels = 0 - sqa_config.use_answer_as_supervision = False - model = TapasForQuestionAnswering(config=sqa_config) - model.eval() - result = model( - input_ids=input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - - # inference: with aggregation head (WTQ, WikiSQL-supervised). 
Model returns logits and aggregation logits - model = TapasForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids=input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) - - # training: can happen in 3 main ways - # case 1: conversational (SQA) - model = TapasForQuestionAnswering(config=sqa_config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=labels, - ) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - - # case 2: weak supervision for aggregation (WTQ) - model = TapasForQuestionAnswering(config=config) - model.eval() - result = model( - input_ids=input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=labels, - numeric_values=numeric_values, - numeric_values_scale=numeric_values_scale, - float_answer=float_answer, - ) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) - - # case 3: strong supervision for aggregation (WikiSQL-supervised) - wikisql_config = copy.copy(config) - wikisql_config.use_answer_as_supervision = False - model = TapasForQuestionAnswering(config=wikisql_config) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - labels=labels, - aggregation_labels=aggregation_labels, - ) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) - - def create_and_check_for_sequence_classification( - self, - config, - input_ids, - input_mask, - token_type_ids, - sequence_labels, - token_labels, - labels, - numeric_values, - numeric_values_scale, - float_answer, - aggregation_labels, - ): - config.num_labels = self.num_labels - model = TapasForSequenceClassification(config) - model.eval() - result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - input_mask, - token_type_ids, - sequence_labels, - token_labels, - labels, - numeric_values, - numeric_values_scale, - float_answer, - aggregation_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class TapasModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - TapasModel, - TapasForMaskedLM, - # TapasForQuestionAnswering, - TapasForSequenceClassification, - ) - if is_mindspore_available() - else None - ) - pipeline_model_mapping = ( - { - "feature-extraction": TapasModel, - "fill-mask": TapasForMaskedLM, - "table-question-answering": TapasForQuestionAnswering, - "text-classification": TapasForSequenceClassification, - "zero-shot": TapasForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - 
test_pruning = False
-    test_resize_embeddings = True
-    test_head_masking = False
-
-    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-        inputs_dict = copy.deepcopy(inputs_dict)
-        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-            inputs_dict = {
-                k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1)
-                if isinstance(v, mindspore.Tensor) and v.ndim > 1
-                else v
-                for k, v in inputs_dict.items()
-            }
-
-        if return_labels:
-            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
-                inputs_dict["labels"] = ops.ones(self.model_tester.batch_size, dtype=mindspore.int64)
-            elif model_class in get_values(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING):
-                inputs_dict["labels"] = ops.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64
-                )
-                inputs_dict["aggregation_labels"] = ops.zeros(
-                    self.model_tester.batch_size, dtype=mindspore.int64
-                )
-                inputs_dict["numeric_values"] = ops.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length),
-                    dtype=mindspore.float32,
-                )
-                inputs_dict["numeric_values_scale"] = ops.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length),
-                    dtype=mindspore.float32,
-                )
-                inputs_dict["float_answer"] = ops.zeros(
-                    self.model_tester.batch_size, dtype=mindspore.float32
-                )
-            elif model_class in [
-                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
-                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
-            ]:
-                inputs_dict["labels"] = ops.zeros(
-                    self.model_tester.batch_size, dtype=mindspore.int64
-                )
-            elif model_class in [
-                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
-                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
-                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
-                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
-            ]:
-                inputs_dict["labels"] = ops.zeros(
-                    (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64
-                )
-        return inputs_dict
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_case_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        return True
-
-    def setUp(self):
-        self.model_tester = TapasModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TapasConfig, dim=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_model(*config_and_inputs)
-
-    def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
-
-    def test_for_question_answering(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
-
-    def test_for_sequence_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
-
-    @unittest.skip('has training errors')
-    def test_training(self):
-        pass
-
-def prepare_tapas_single_inputs_for_inference():
-    # Here we prepare a single table-question pair to test TAPAS inference on:
-    data = {
-        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
-        "Age": ["33", "35"],
-    }
-    queries = "Which footballer is 33 years old?"
- table = pd.DataFrame.from_dict(data) - - return table, queries - - -def prepare_tapas_batch_inputs_for_inference(): - # Here we prepare a batch of 2 table-question pairs to test TAPAS inference on: - data = { - "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], - "Age": ["33", "35"], - "Number of goals": ["712", "750"], - } - queries = ["Which footballer is 33 years old?", "How many goals does Ronaldo have?"] - table = pd.DataFrame.from_dict(data) - - return table, queries - - -def prepare_tapas_batch_inputs_for_training(): - # Here we prepare a DIFFERENT batch of 2 table-question pairs to test TAPAS training on: - data = { - "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], - "Age": ["33", "35"], - "Number of goals": ["712", "750"], - } - queries = ["Which footballer is 33 years old?", "What's the total number of goals?"] - table = pd.DataFrame.from_dict(data) - - answer_coordinates = [[(0, 0)], [(0, 2), (1, 2)]] - answer_text = [["Lionel Messi"], ["1462"]] - float_answer = [float("NaN"), float("1462")] - - return table, queries, answer_coordinates, answer_text, float_answer - - -@require_mindspore -class TapasModelIntegrationTest(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq") - - @slow - def test_inference_no_head(self): - # ideally we want to test this with the weights of tapas_inter_masklm_base_reset, - # but since it's not straightforward to do this with the TF 1 implementation, we test it with - # the weights of the WTQ base model (i.e. tapas_wtq_wikisql_sqa_inter_masklm_base_reset) - model = TapasModel.from_pretrained("google/tapas-base-finetuned-wtq") - - tokenizer = self.default_tokenizer - table, queries = prepare_tapas_single_inputs_for_inference() - inputs = tokenizer(table=table, queries=queries, return_tensors="ms") - inputs = {k: v for k, v in inputs.items()} - with no_grad(): - outputs = model(**inputs) - # test the sequence output - expected_slice = mindspore.tensor( - [ - [ - [-0.141581565, -0.599805772, 0.747186482], - [-0.143664181, -0.602008104, 0.749218345], - [-0.15169853, -0.603363097, 0.741370678], - ] - ], - ) - - self.assertTrue(ops.allclose(outputs.last_hidden_state[:, :3, :3], expected_slice, atol=0.0005)) - - # test the pooled output - expected_slice = mindspore.tensor([[0.987518311, -0.970520139, -0.994303405]]) - - self.assertTrue(ops.allclose(outputs.pooler_output[:, :3], expected_slice, atol=0.0005)) - - @unittest.skip(reason="Model not available yet") - def test_inference_masked_lm(self): - pass - - # TapasForQuestionAnswering has 3 possible ways of being fine-tuned: - # - conversational set-up (SQA) - # - weak supervision for aggregation (WTQ, WikiSQL) - # - strong supervision for aggregation (WikiSQL-supervised) - # We test all of them: - @slow - def test_inference_question_answering_head_conversational(self): - # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset - model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa") - - tokenizer = self.default_tokenizer - table, queries = prepare_tapas_single_inputs_for_inference() - inputs = tokenizer(table=table, queries=queries, return_tensors="ms") - inputs = {k: v for k, v in inputs.items()} - with no_grad(): - outputs = model(**inputs) - # test the logits - logits = outputs.logits - expected_shape = (1, 21) - self.assertEqual(logits.shape, expected_shape) - - expected_tensor = mindspore.tensor( - [ - [ - -9997.22461, - -9997.22461, 
- -9997.22461, - -9997.22461, - -9997.22461, - -9997.22461, - -9997.22461, - -9997.22461, - -9997.22461, - -16.2628059, - -10004.082, - 15.4330549, - 15.4330549, - 15.4330549, - -9990.42, - -16.3270779, - -16.3270779, - -16.3270779, - -16.3270779, - -16.3270779, - -10004.8506, - ] - ], - ) - - self.assertTrue(ops.allclose(logits, expected_tensor, atol=0.015)) - - @slow - def test_inference_question_answering_head_conversational_absolute_embeddings(self): - # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset - # however here we test the version with absolute position embeddings - model = TapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa", revision="no_reset") - tokenizer = self.default_tokenizer - table, queries = prepare_tapas_single_inputs_for_inference() - inputs = tokenizer(table=table, queries=queries, return_tensors="ms") - inputs = {k: v for k, v in inputs.items()} - with no_grad(): - outputs = model(**inputs) - # test the logits - logits = outputs.logits - expected_shape = (1, 21) - self.assertEqual(logits.shape, expected_shape) - - expected_tensor = mindspore.tensor( - [ - [ - -10014.7793, - -10014.7793, - -10014.7793, - -10014.7793, - -10014.7793, - -10014.7793, - -10014.7793, - -10014.7793, - -10014.7793, - -18.8419304, - -10018.0391, - 17.7848816, - 17.7848816, - 17.7848816, - -9981.02832, - -16.4005489, - -16.4005489, - -16.4005489, - -16.4005489, - -16.4005489, - -10013.4736, - ] - ], - ) - - self.assertTrue(ops.allclose(logits, expected_tensor, atol=0.01)) - - @slow - def test_inference_question_answering_head_weak_supervision(self): - # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset - model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq") - - tokenizer = self.default_tokenizer - # let's test on a batch - table, queries = prepare_tapas_batch_inputs_for_inference() - inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="ms") - inputs_on_device = {k: v for k, v in inputs.items()} - - with no_grad(): - outputs = model(**inputs_on_device) - # test the logits - logits = outputs.logits - expected_shape = (2, 28) - self.assertEqual(logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [ - [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736], - [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677], - ], - ) - - self.assertTrue(ops.allclose(logits[:, -6:], expected_slice, atol=0.4)) - - # test the aggregation logits - logits_aggregation = outputs.logits_aggregation - expected_shape = (2, 4) - self.assertEqual(logits_aggregation.shape, expected_shape) - expected_tensor = mindspore.tensor( - [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]], - ) - - self.assertTrue(ops.allclose(logits_aggregation, expected_tensor, atol=0.001)) - - # test the predicted answer coordinates and aggregation indices - EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]] - EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1] - - predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( - inputs, outputs.logits.detach().cpu(), outputs.logits_aggregation.detach().cpu() - ) - - self.assertEqual(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates) - self.assertEqual(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices) - - @slow 
- def test_training_question_answering_head_weak_supervision(self): - # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset - model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq") - # normally we should put the model in training mode but it's a pain to do this with the TF 1 implementation - - tokenizer = self.default_tokenizer - # let's test on a batch - table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training() - inputs = tokenizer( - table=table, - queries=queries, - answer_coordinates=answer_coordinates, - answer_text=answer_text, - padding="longest", - return_tensors="ms", - ) - - # prepare data (created by the tokenizer) and move to torch_device - input_ids = inputs["input_ids"] - attention_mask = inputs["attention_mask"] - token_type_ids = inputs["token_type_ids"] - labels = inputs["labels"] - numeric_values = inputs["numeric_values"] - numeric_values_scale = inputs["numeric_values_scale"] - - # the answer should be prepared by the user - float_answer = mindspore.Tensor(float_answer) - - # forward pass to get loss + logits: - with no_grad(): - outputs = model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels, - numeric_values=numeric_values, - numeric_values_scale=numeric_values_scale, - float_answer=float_answer, - ) - - # test the loss - loss = outputs.loss - expected_loss = mindspore.tensor(3.3527612686157227e-08) - self.assertTrue(ops.allclose(loss, expected_loss, atol=1e-6)) - - # test the logits on the first example - logits = outputs.logits - expected_shape = (2, 29) - self.assertEqual(logits.shape, expected_shape) - expected_slice = mindspore.tensor( - [ - -160.0156, - -160.0156, - -160.0156, - -160.0156, - -160.0156, - -10072.2266, - -10070.8896, - -10092.6006, - -10092.6006, - ], - ) - - self.assertTrue(ops.allclose(logits[0, -9:], expected_slice, atol=1e-6)) - - # test the aggregation logits on the second example - logits_aggregation = outputs.logits_aggregation - expected_shape = (2, 4) - self.assertEqual(logits_aggregation.shape, expected_shape) - expected_slice = mindspore.tensor([-4.0538, 40.0304, -5.3554, 23.3965]) - - self.assertTrue(ops.allclose(logits_aggregation[1, -4:], expected_slice, atol=1e-4)) - - @slow - def test_inference_question_answering_head_strong_supervision(self): - # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset - model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised") - - tokenizer = self.default_tokenizer - table, queries = prepare_tapas_single_inputs_for_inference() - inputs = tokenizer(table=table, queries=queries, return_tensors="ms") - inputs = {k: v for k, v in inputs.items()} - with no_grad(): - outputs = model(**inputs) - # test the logits - logits = outputs.logits - expected_shape = (1, 21) - self.assertEqual(logits.shape, expected_shape) - expected_tensor = mindspore.tensor( - [ - [ - -10011.1084, - -10011.1084, - -10011.1084, - -10011.1084, - -10011.1084, - -10011.1084, - -10011.1084, - -10011.1084, - -10011.1084, - -18.6185989, - -10008.7969, - 17.6355762, - 17.6355762, - 17.6355762, - -10002.4404, - -18.7111301, - -18.7111301, - -18.7111301, - -18.7111301, - -18.7111301, - -10007.0977, - ] - ], - ) - - self.assertTrue(ops.allclose(logits, expected_tensor, atol=0.02)) - - # test the aggregation logits - logits_aggregation = 
outputs.logits_aggregation - expected_shape = (1, 4) - self.assertEqual(logits_aggregation.shape, expected_shape) - expected_tensor = mindspore.tensor( - [[16.5659733, -3.06624889, -2.34152961, -0.970244825]] - ) # PyTorch model outputs [[16.5679, -3.0668, -2.3442, -0.9674]] - - self.assertTrue(ops.allclose(logits_aggregation, expected_tensor, atol=0.003)) - - @slow - def test_inference_classification_head(self): - # note that google/tapas-base-finetuned-tabfact should correspond to tapas_tabfact_inter_masklm_base_reset - model = TapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact") - - tokenizer = self.default_tokenizer - table, queries = prepare_tapas_single_inputs_for_inference() - inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="ms") - inputs = {k: v for k, v in inputs.items()} - with no_grad(): - outputs = model(**inputs) - - # test the classification logits - logits = outputs.logits - expected_shape = (1, 2) - self.assertEqual(logits.shape, expected_shape) - expected_tensor = mindspore.tensor( - [[0.795137286, 9.5572]] - ) # Note that the PyTorch model outputs [[0.8057, 9.5281]] - - self.assertTrue(ops.allclose(outputs.logits, expected_tensor, atol=0.05)) - - -# Below: tests for Tapas utilities which are defined in modeling_tapas.py. -# These are based on segmented_tensor_test.py of the original implementation. -# URL: https://github.com/google-research/tapas/blob/master/tapas/models/segmented_tensor_test.py -@require_mindspore -class TapasUtilitiesTest(unittest.TestCase): - def _prepare_tables(self): - """Prepares two tables, both with three distinct rows. - The first table has two columns: - 1.0, 2.0 | 3.0 - 2.0, 0.0 | 1.0 - 1.0, 3.0 | 4.0 - The second table has three columns: - 1.0 | 2.0 | 3.0 - 2.0 | 0.0 | 1.0 - 1.0 | 3.0 | 4.0 - Returns: - SegmentedTensors with the tables. - """ - values = mindspore.tensor( - [ - [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]], - [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]], - ] - ) - row_index = IndexMap( - indices=mindspore.tensor( - [ - [[0, 0, 0], [1, 1, 1], [2, 2, 2]], - [[0, 0, 0], [1, 1, 1], [2, 2, 2]], - ] - ), - num_segments=3, - batch_dims=1, - ) - col_index = IndexMap( - indices=mindspore.tensor( - [ - [[0, 0, 1], [0, 0, 1], [0, 0, 1]], - [[0, 1, 2], [0, 1, 2], [0, 1, 2]], - ] - ), - num_segments=3, - batch_dims=1, - ) - return values, row_index, col_index - - def test_product_index(self): - _, row_index, col_index = self._prepare_tables() - cell_index = ProductIndexMap(row_index, col_index) - row_index_proj = cell_index.project_outer(cell_index) - col_index_proj = cell_index.project_inner(cell_index) - - ind = cell_index.indices - self.assertEqual(cell_index.num_segments, 9) - - # Projections should give back the original indices. - # we use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(row_index.indices.numpy(), row_index_proj.indices.numpy()) - self.assertEqual(row_index.num_segments, row_index_proj.num_segments) - self.assertEqual(row_index.batch_dims, row_index_proj.batch_dims) - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(col_index.indices.numpy(), col_index_proj.indices.numpy()) - self.assertEqual(col_index.batch_dims, col_index_proj.batch_dims) - - # The first and second "column" are identified in the first table. 
- for i in range(3): - self.assertEqual(ind[0, i, 0], ind[0, i, 1]) - self.assertNotEqual(ind[0, i, 0], ind[0, i, 2]) - - # All rows are distinct in the first table. - for i, i_2 in zip(range(3), range(3)): - for j, j_2 in zip(range(3), range(3)): - if i != i_2 and j != j_2: - self.assertNotEqual(ind[0, i, j], ind[0, i_2, j_2]) - - # All cells are distinct in the second table. - for i, i_2 in zip(range(3), range(3)): - for j, j_2 in zip(range(3), range(3)): - if i != i_2 or j != j_2: - self.assertNotEqual(ind[1, i, j], ind[1, i_2, j_2]) - - def test_flatten(self): - _, row_index, col_index = self._prepare_tables() - row_index_flat = flatten(row_index) - col_index_flat = flatten(col_index) - - shape = [3, 4, 5] - batched_index = IndexMap(indices=ops.zeros(shape).to(mindspore.int64), num_segments=1, batch_dims=3) - batched_index_flat = flatten(batched_index) - - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal( - row_index_flat.indices.numpy(), [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5] - ) - np.testing.assert_array_equal( - col_index_flat.indices.numpy(), [0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 4, 5, 3, 4, 5, 3, 4, 5] - ) - self.assertEqual(batched_index_flat.num_segments.numpy(), np.prod(shape)) - np.testing.assert_array_equal(batched_index_flat.indices.numpy(), range(np.prod(shape))) - - def test_range_index_map(self): - batch_shape = [3, 4] - num_segments = 5 - index = range_index_map(batch_shape, num_segments) - - self.assertEqual(num_segments, index.num_segments) - self.assertEqual(2, index.batch_dims) - indices = index.indices - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(list(indices.shape), [3, 4, 5]) - for i in range(batch_shape[0]): - for j in range(batch_shape[1]): - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(indices[i, j, :].numpy(), range(num_segments)) - - @unittest.skip - def test_reduce_sum(self): - values, row_index, col_index = self._prepare_tables() - cell_index = ProductIndexMap(row_index, col_index) - row_sum, _ = reduce_sum(values, row_index) - col_sum, _ = reduce_sum(values, col_index) - cell_sum, _ = reduce_sum(values, cell_index) - - # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose - np.testing.assert_allclose(row_sum.numpy(), [[6.0, 3.0, 8.0], [6.0, 3.0, 8.0]]) - np.testing.assert_allclose(col_sum.numpy(), [[9.0, 8.0, 0.0], [4.0, 5.0, 8.0]]) - np.testing.assert_allclose( - cell_sum.numpy(), - [[3.0, 3.0, 0.0, 2.0, 1.0, 0.0, 4.0, 4.0, 0.0], [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0]], - ) - - def test_reduce_mean(self): - values, row_index, col_index = self._prepare_tables() - cell_index = ProductIndexMap(row_index, col_index) - row_mean, _ = reduce_mean(values, row_index) - col_mean, _ = reduce_mean(values, col_index) - cell_mean, _ = reduce_mean(values, cell_index) - - # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose - np.testing.assert_allclose( - row_mean.numpy(), [[6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0], [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0]] - ) - np.testing.assert_allclose(col_mean.numpy(), [[9.0 / 6.0, 8.0 / 3.0, 0.0], [4.0 / 3.0, 5.0 / 3.0, 8.0 / 3.0]]) - np.testing.assert_allclose( - cell_mean.numpy(), - [ - [3.0 / 2.0, 3.0, 0.0, 2.0 / 2.0, 1.0, 0.0, 4.0 / 2.0, 4.0, 0.0], - [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0], - ], - ) - - def test_reduce_max(self): - values = ops.as_tensor([2.0, 1.0, 0.0, 3.0]) - index 
= IndexMap(indices=ops.as_tensor([0, 1, 0, 1]), num_segments=2) - maximum, _ = reduce_max(values, index) - - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(maximum.numpy(), [2, 3]) - - def test_reduce_sum_vectorized(self): - values = ops.as_tensor([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]) - index = IndexMap(indices=ops.as_tensor([[0, 0, 1]]), num_segments=2, batch_dims=0) - sums, new_index = reduce_sum(values, index) - - # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose - np.testing.assert_allclose(sums.numpy(), [3.0, 3.0]) - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(new_index.indices.numpy(), [0, 1]) - np.testing.assert_array_equal(new_index.num_segments.numpy(), 2) - np.testing.assert_array_equal(new_index.batch_dims, 0) - - def test_gather(self): - values, row_index, col_index = self._prepare_tables() - cell_index = ProductIndexMap(row_index, col_index) - - # Compute sums and then gather. The result should have the same shape as - # the original table and each element should contain the sum of the values in - # its cell. - sums, _ = reduce_sum(values, cell_index) - cell_sum = gather(sums, cell_index) - assert cell_sum.shape == values.shape - - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_allclose( - cell_sum.numpy(), - [[[3.0, 3.0, 3.0], [2.0, 2.0, 1.0], [4.0, 4.0, 4.0]], [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]]], - ) - - def test_gather_vectorized(self): - values = ops.as_tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) - index = IndexMap(indices=ops.as_tensor([[0, 1], [1, 0]]), num_segments=2, batch_dims=1) - result = gather(values, index) - - # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual - np.testing.assert_array_equal(result.numpy(), [[[1, 2], [3, 4]], [[7, 8], [5, 6]]]) \ No newline at end of file diff --git a/tests/transformers/models/tapas/test_tokenization_tapas.py b/tests/transformers/models/tapas/test_tokenization_tapas.py deleted file mode 100644 index 8e1b70152..000000000 --- a/tests/transformers/models/tapas/test_tokenization_tapas.py +++ /dev/null @@ -1,1189 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-import inspect -import os -import shutil -import tempfile -import unittest -from typing import List - -import numpy as np -import pandas as pd -from mindspore import ops - -from mindnlp.transformers.tokenization_utils import AddedToken -from mindnlp.utils import is_mindspore_available -from mindnlp.transformers.models.tapas.tokenization_tapas import ( - VOCAB_FILES_NAMES, - BasicTokenizer, - TapasTokenizer, - WordpieceTokenizer, - _is_control, - _is_punctuation, - _is_whitespace, -) -from mindnlp.utils.testing_utils import ( - require_tokenizers, - require_mindspore, - slow, -) - -from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings - - - -@require_tokenizers -class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "google/tapas-large-finetuned-sqa" - tokenizer_class = TapasTokenizer - test_rust_tokenizer = False - space_between_special_tokens = True - from_pretrained_filter = filter_non_english - test_seq2seq = False - - def get_table( - self, - tokenizer: TapasTokenizer, - length=5, - ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] - - if length == 0: - data = {} - else: - data = {toks[0]: [toks[tok] for tok in range(1, length)]} - - table = pd.DataFrame.from_dict(data) - - return table - - def get_table_and_query( - self, - tokenizer: TapasTokenizer, - length=5, - ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] - table = self.get_table(tokenizer, length=length - 3) - query = " ".join(toks[:3]) - - return table, query - - def get_clean_sequence( - self, - tokenizer: TapasTokenizer, - with_prefix_space=False, - max_length=20, - min_length=5, - empty_table: bool = False, - add_special_tokens: bool = True, - return_table_and_query: bool = False, - ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] - - if empty_table: - table = pd.DataFrame.from_dict({}) - query = " ".join(toks[:min_length]) - else: - data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]} - table = pd.DataFrame.from_dict(data) - query = " ".join(toks[:3]) - - output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens) - output_txt = tokenizer.decode(output_ids) - - assert len(output_ids) >= min_length, "Update the code to generate the sequences so that they are larger" - assert len(output_ids) <= max_length, "Update the code to generate the sequences so that they are smaller" - - if return_table_and_query: - return output_txt, output_ids, table, query - - return output_txt, output_ids - - def setUp(self): - super().setUp() - - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "[PAD]", - "[MASK]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00e9d,running" - output_text = "unwanted, running" - return input_text, output_text - - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - sequence = "UNwant\u00e9d,running" - - tokens = tokenizer.tokenize(sequence) - rust_tokens = 
rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - rust_tokenizer = self.get_rust_tokenizer() - ids = tokenizer.encode(sequence) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - # With lower casing - tokenizer = self.get_tokenizer(do_lower_case=True) - rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - - sequence = "UNwant\u00e9d,running" - - tokens = tokenizer.tokenize(sequence) - rust_tokens = rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - rust_tokenizer = self.get_rust_tokenizer() - ids = tokenizer.encode(sequence) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - @unittest.skip("Chat template tests don't play well with table/layout models.") - def test_chat_template_batched(self): - pass - - def test_chinese(self): - tokenizer = BasicTokenizer() - - self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) - - def test_basic_tokenizer_lower(self): - tokenizer = BasicTokenizer(do_lower_case=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) - - def test_basic_tokenizer_lower_strip_accents_false(self): - tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) - - def test_basic_tokenizer_lower_strip_accents_true(self): - tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) - - def test_basic_tokenizer_lower_strip_accents_default(self): - tokenizer = BasicTokenizer(do_lower_case=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] - ) - self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) - - def test_basic_tokenizer_no_lower(self): - tokenizer = BasicTokenizer(do_lower_case=False) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] - ) - - def test_basic_tokenizer_no_lower_strip_accents_false(self): - tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] - ) - - def test_basic_tokenizer_no_lower_strip_accents_true(self): - tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) - - self.assertListEqual( - tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] - ) - - def test_basic_tokenizer_respects_never_split_tokens(self): - tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) - - self.assertListEqual( - tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] - ) - - def test_wordpiece_tokenizer(self): - vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] - - vocab = {} - for i, token in enumerate(vocab_tokens): - vocab[token] = i - tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") - - self.assertListEqual(tokenizer.tokenize(""), []) - - self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) - - self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) - - def test_is_whitespace(self): - self.assertTrue(_is_whitespace(" ")) - self.assertTrue(_is_whitespace("\t")) - self.assertTrue(_is_whitespace("\r")) - self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00a0")) - - self.assertFalse(_is_whitespace("A")) - self.assertFalse(_is_whitespace("-")) - - def test_is_control(self): - self.assertTrue(_is_control("\u0005")) - - self.assertFalse(_is_control("A")) - self.assertFalse(_is_control(" ")) - self.assertFalse(_is_control("\t")) - self.assertFalse(_is_control("\r")) - - def test_is_punctuation(self): - self.assertTrue(_is_punctuation("-")) - self.assertTrue(_is_punctuation("$")) - self.assertTrue(_is_punctuation("`")) - self.assertTrue(_is_punctuation(".")) - - self.assertFalse(_is_punctuation("A")) - self.assertFalse(_is_punctuation(" ")) - - def test_clean_text(self): - tokenizer = self.get_tokenizer() - - # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 - self.assertListEqual( - [tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], ["[EMPTY]"], ["[UNK]"]] - ) - - @slow - def test_sequence_builders(self): - tokenizer = self.tokenizer_class.from_pretrained("google/tapas-base-finetuned-wtq",from_pt=True) - - empty_table = self.get_table(tokenizer, length=0) - table = self.get_table(tokenizer, length=10) - - text = tokenizer.encode(table, add_special_tokens=False) - text_2 = tokenizer.encode(empty_table, "multi-sequence build", add_special_tokens=False) - - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - assert encoded_pair == [101] + text + [102] + text_2 - - def test_offsets_with_special_characters(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, from_pt=True, **kwargs) - - sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
- tokens = tokenizer_r.encode_plus( - sentence, - return_attention_mask=False, - return_token_type_ids=False, - return_offsets_mapping=True, - add_special_tokens=True, - ) - - do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False - expected_results = ( - [ - ((0, 0), tokenizer_r.cls_token), - ((0, 1), "A"), - ((1, 2), ","), - ((3, 5), "na"), - ((5, 6), "##ï"), - ((6, 8), "##ve"), - ((9, 15), tokenizer_r.mask_token), - ((16, 21), "Allen"), - ((21, 23), "##NL"), - ((23, 24), "##P"), - ((25, 33), "sentence"), - ((33, 34), "."), - ((0, 0), tokenizer_r.sep_token), - ] - if not do_lower_case - else [ - ((0, 0), tokenizer_r.cls_token), - ((0, 1), "a"), - ((1, 2), ","), - ((3, 8), "naive"), - ((9, 15), tokenizer_r.mask_token), - ((16, 21), "allen"), - ((21, 23), "##nl"), - ((23, 24), "##p"), - ((25, 33), "sentence"), - ((33, 34), "."), - ((0, 0), tokenizer_r.sep_token), - ] - ) - - self.assertEqual( - [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) - ) - self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) - - def test_add_special_tokens(self): - tokenizers: List[TapasTokenizer] = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - input_table = self.get_table(tokenizer, length=0) - - special_token = "[SPECIAL_TOKEN]" - - tokenizer.add_special_tokens({"cls_token": special_token}) - encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False) - self.assertEqual(len(encoded_special_token), 1) - - decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) - self.assertTrue(special_token not in decoded) - - def test_add_tokens_tokenizer(self): - tokenizers: List[TapasTokenizer] = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - - # We usually have added tokens from the start in tests because our vocab fixtures are - # smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.encode( - table, - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", - add_special_tokens=False, - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], 
tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.pad_token_id) - - @require_tokenizers - def test_encode_decode_with_spaces(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - - new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] - tokenizer.add_tokens(new_toks) - input = "[ABC][DEF][ABC][DEF]" - if self.space_between_special_tokens: - output = "[ABC] [DEF] [ABC] [DEF]" - else: - output = input - encoded = tokenizer.encode(table, input, add_special_tokens=False) - decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) - self.assertIn(decoded, [output, output.lower()]) - - def test_encode_plus_with_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequence = "Sequence" - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_size = 10 - padding_idx = tokenizer.pad_token_id - token_type_padding_idx = tokenizer.pad_token_type_id - - encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True) - input_ids = encoded_sequence["input_ids"] - special_tokens_mask = encoded_sequence["special_tokens_mask"] - sequence_length = len(input_ids) - - # Test 'longest' and 'no_padding' don't do anything - tokenizer.padding_side = "right" - - not_padded_sequence = tokenizer.encode_plus( - table, - sequence, - padding=False, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - assert sequence_length == not_padded_sequence_length - assert input_ids == not_padded_input_ids - assert special_tokens_mask == not_padded_special_tokens_mask - - not_padded_sequence = tokenizer.encode_plus( - table, - sequence, - padding=False, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - assert sequence_length == not_padded_sequence_length - assert input_ids == not_padded_input_ids - assert special_tokens_mask == not_padded_special_tokens_mask - - # Test right padding - tokenizer.padding_side = "right" - - right_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - right_padded_input_ids = right_padded_sequence["input_ids"] - - right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] - right_padded_sequence_length = len(right_padded_input_ids) - - assert sequence_length + padding_size == right_padded_sequence_length - assert input_ids + [padding_idx] * padding_size == right_padded_input_ids - assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask - - # Test left padding - 
tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - left_padded_input_ids = left_padded_sequence["input_ids"] - left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] - left_padded_sequence_length = len(left_padded_input_ids) - - assert sequence_length + padding_size == left_padded_sequence_length - assert [padding_idx] * padding_size + input_ids == left_padded_input_ids - assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask - - if "token_type_ids" in tokenizer.model_input_names: - token_type_ids = encoded_sequence["token_type_ids"] - left_padded_token_type_ids = left_padded_sequence["token_type_ids"] - right_padded_token_type_ids = right_padded_sequence["token_type_ids"] - - assert ( - token_type_ids + [[token_type_padding_idx] * 7] * padding_size == right_padded_token_type_ids - ) - assert [[token_type_padding_idx] * 7] * padding_size + token_type_ids == left_padded_token_type_ids - - if "attention_mask" in tokenizer.model_input_names: - attention_mask = encoded_sequence["attention_mask"] - right_padded_attention_mask = right_padded_sequence["attention_mask"] - left_padded_attention_mask = left_padded_sequence["attention_mask"] - - assert attention_mask + [0] * padding_size == right_padded_attention_mask - assert [0] * padding_size + attention_mask == left_padded_attention_mask - - def test_internal_consistency(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - input_text, output_text = self.get_input_output_texts(tokenizer) - - tokens = tokenizer.tokenize(input_text) - ids = tokenizer.convert_tokens_to_ids(tokens) - ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False) - self.assertListEqual(ids, ids_2) - - tokens_2 = tokenizer.convert_ids_to_tokens(ids) - self.assertNotEqual(len(tokens_2), 0) - text_2 = tokenizer.decode(ids) - self.assertIsInstance(text_2, str) - - self.assertEqual(text_2, output_text) - - def test_mask_output(self): - tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) - - if ( - tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" - and "token_type_ids" in tokenizer.model_input_names - ): - information = tokenizer.encode_plus(table, query, add_special_tokens=True) - sequences, mask = information["input_ids"], information["token_type_ids"] - self.assertEqual(len(sequences), len(mask)) - - @unittest.skip("TAPAS tokenizer only handles two sequences.") - def test_maximum_encoding_length_pair_input(self): - pass - - @unittest.skip("TAPAS tokenizer only handles two sequences.") - def test_maximum_encoding_length_single_input(self): - pass - - def test_number_of_added_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) - - sequences = tokenizer.encode(table, query, add_special_tokens=False) - attached_sequences = tokenizer.encode(table, query, add_special_tokens=True) - - # Method is implemented (e.g. 
not GPT-2) - if len(attached_sequences) != 2: - self.assertEqual( - tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) - ) - - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer) - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - table, sequence, max_length=sequence_length + padding_size, padding=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - # Test not batched - table = self.get_table(tokenizer, length=0) - encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0]) - encoded_sequences_2 = tokenizer(table, sequences[0]) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test not batched pairs - table = self.get_table(tokenizer, length=10) - encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1]) - encoded_sequences_2 = tokenizer(table, sequences[1]) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test batched - table = self.get_table(tokenizer, length=0) - encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences) - encoded_sequences_2 = tokenizer(table, sequences) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - def test_batch_encode_plus_batch_sequence_length(self): - # Tests that all encoded values have the correct size - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - encoded_sequences = [tokenizer.encode_plus(table, sequence) for 
sequence in sequences] - encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - maximum_length = len( - max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) - ) - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences_padded = [ - tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length") - for sequence in sequences - ] - - encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True) - self.assertListEqual( - encoded_sequences_padded, - self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), - ) - - # check 'longest' is insensitive to a max length - encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True) - encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( - table, sequences, max_length=maximum_length + 10, padding="longest" - ) - for key in encoded_sequences_batch_padded_1.keys(): - self.assertListEqual( - encoded_sequences_batch_padded_1[key], - encoded_sequences_batch_padded_2[key], - ) - - # check 'no_padding' is insensitive to a max length - encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False) - encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( - table, sequences, max_length=maximum_length + 10, padding=False - ) - for key in encoded_sequences_batch_padded_1.keys(): - self.assertListEqual( - encoded_sequences_batch_padded_1[key], - encoded_sequences_batch_padded_2[key], - ) - - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") - def test_batch_encode_plus_overflowing_tokens(self): - pass - - def test_batch_encode_plus_padding(self): - # Test that padded sequences are equivalent between batch_encode_plus and encode_plus - - # Right padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences = [ - tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length") - for sequence in sequences - ] - encoded_sequences_batch = tokenizer.batch_encode_plus( - table, sequences, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - # Left padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokenizer.padding_side = "left" - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually -
self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences = [ - tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length") - for sequence in sequences - ] - encoded_sequences_batch = tokenizer.batch_encode_plus( - table, sequences, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - def test_padding_to_multiple_of(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - if tokenizer.pad_token is None: - self.skipTest("No padding token.") - else: - empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8) - normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8) - for key, value in empty_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - # Should also work with truncation - normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - @unittest.skip("TAPAS cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`") - def test_prepare_for_model(self): - pass - - def test_tokenizer_slow_store_full_signature(self): - signature = inspect.signature(self.tokenizer_class.__init__) - tokenizer = self.get_tokenizer() - - for parameter_name, parameter in signature.parameters.items(): - if parameter.default != inspect.Parameter.empty: - self.assertIn(parameter_name, tokenizer.init_kwargs) - - def test_special_tokens_mask_input_pairs(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence_0 = "Encode this." - empty_table = self.get_table(tokenizer, length=0) - table = self.get_table(tokenizer, length=10) - encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False) - encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - table, - sequence_0, - add_special_tokens=True, - return_special_tokens_mask=True, - # add_prefix_space=False, - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_special_tokens_mask(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequence_0 = "Encode this." 
- # Testing single inputs - encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_save_and_load_tokenizer(self): - # safety check on max_len default value so we are sure the test works - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertNotEqual(tokenizer.model_max_length, 42) - - # Now let's start the test - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - table = self.get_table(tokenizer, length=0) - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00e9d,running" - before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, from_pt=True) - after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False) - after_vocab = after_tokenizer.get_vocab() - self.assertListEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - - shutil.rmtree(tmpdirname) - - @unittest.skip("Not implemented") - def test_right_and_left_truncation(self): - pass - - def test_right_and_left_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - table, sequence, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "left" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - table, sequence, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert [padding_idx] * padding_size + encoded_sequence == padded_sequence - - # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 
'no_padding' - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence, padding=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(table, sequence, padding="longest") - padded_sequence_left_length = len(padded_sequence_left) - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(table, sequence, padding=False) - padded_sequence_left_length = len(padded_sequence_left) - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left - - def test_token_type_ids(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - empty_table = self.get_table(tokenizer, length=0) - seq_0 = "Test this method." - - # We want to have sequence 0 and sequence 1 are tagged - # respectively with 0 and 1 token_ids - # (regardless of whether the model use token type ids) - # We use this assumption in the QA pipeline among other place - output = tokenizer(empty_table, seq_0, return_token_type_ids=True) - - # Assert that the token type IDs have the same length as the input IDs - self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) - - # Assert that each token type ID has 7 values - self.assertTrue(all(len(token_type_ids) == 7 for token_type_ids in output["token_type_ids"])) - - # Do the same test as modeling common. 
- self.assertIn(0, output["token_type_ids"][0]) - - @require_mindspore - @slow - def test_mindspore_encode_plus_sent_to_model(self): - import mindspore - - from mindnlp.transformers import MODEL_MAPPING, TOKENIZER_MAPPING - - MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) - - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return - - config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] - config = config_class() - - if config.is_encoder_decoder or config.pad_token_id is None: - return - - model = model_class(config) - - # Make sure the model contains at least the full vocabulary size in its embedding matrix - is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") - assert ( - (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) - if is_using_common_embeddings - else True - ) - - # Build sequence - first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] - sequence = " ".join(first_ten_tokens) - table = self.get_table(tokenizer, length=0) - encoded_sequence = tokenizer.encode_plus(table, sequence, return_tensors="ms") - batch_encoded_sequence = tokenizer.batch_encode_plus(table, [sequence, sequence], return_tensors="ms") - # This should not fail - - model(**encoded_sequence) - model(**batch_encoded_sequence) - - @unittest.skip("TAPAS doesn't handle pre-tokenized inputs.") - def test_pretokenized_inputs(self): - pass - - @slow - def test_tapas_truncation_integration_test(self): - data = { - "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], - "Age": ["56", "45", "59"], - "Number of movies": ["87", "53", "69"], - "Date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], - } - queries = [ - "When was Brad Pitt born?", - "Which actor appeared in the least number of movies?", - "What is the average number of movies?", - ] - table = pd.DataFrame.from_dict(data) - - tokenizer = TapasTokenizer.from_pretrained("lysandre/tapas-temporary-repo", model_max_length=512, from_pt=True) - - for i in range(12): - # The table cannot even encode the headers, so raise an error - with self.assertRaises(ValueError): - tokenizer.encode(table=table, query=queries[0], max_length=i, truncation="drop_rows_to_fit") - - for i in range(12, 512): - new_encoded_inputs = tokenizer.encode( - table=table, query=queries[0], max_length=i, truncation="drop_rows_to_fit" - ) - - # Ensure that the input IDs are less than the max length defined. - self.assertLessEqual(len(new_encoded_inputs), i) - - tokenizer.model_max_length = 20 - new_encoded_inputs = tokenizer.encode(table=table, query=queries[0], truncation=True) - dropped_encoded_inputs = tokenizer.encode(table=table, query=queries[0], truncation="drop_rows_to_fit") - - # Ensure that the input IDs are still truncated when no max_length is specified - self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs) - self.assertLessEqual(len(new_encoded_inputs), 20) - - @slow - def test_min_max_question_length(self): - data = { - "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], - "Age": ["56", "45", "59"], - "Number of movies": ["87", "53", "69"], - "Date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], - } - queries = "When was Brad Pitt born?" 
- table = pd.DataFrame.from_dict(data) - - # test max_question_length - tokenizer = TapasTokenizer.from_pretrained("lysandre/tapas-temporary-repo", max_question_length=2, from_pt=True) - - encoding = tokenizer(table=table, queries=queries) - - # query should not be tokenized as it's longer than the specified max_question_length - expected_results = [101, 102] - - self.assertListEqual(encoding.input_ids[:2], expected_results) - - # test min_question_length - tokenizer = TapasTokenizer.from_pretrained("lysandre/tapas-temporary-repo", min_question_length=30, from_pt=True) - - encoding = tokenizer(table=table, queries=queries) - - # query should not be tokenized as it's shorter than the specified min_question_length - expected_results = [101, 102] - - self.assertListEqual(encoding.input_ids[:2], expected_results) - - @slow - def test_tapas_integration_test(self): - data = { - "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], - "Age": ["56", "45", "59"], - "Number of movies": ["87", "53", "69"], - "Date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], - } - queries = [ - "When was Brad Pitt born?", - "Which actor appeared in the least number of movies?", - "What is the average number of movies?", - ] - table = pd.DataFrame.from_dict(data) - - tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq", model_max_length=512, from_pt=True) - - expected_results = {'input_ids':[101,2043,2001,8226,15091,2141,1029,102,5889,2287,2193,1997,5691,3058,1997,4182,8226,15091,5179,6584,2324,2285,3699,14720,4487,6178,9488,3429,5187,2340,2281,3326,2577,18856,7828,3240,5354,6353,1020,2089,3777],'attention_mask':[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],'token_type_ids':[[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[0,0,0,0,0,0,0],[1,1,0,0,0,0,0],[1,2,0,0,0,0,0],[1,3,0,0,0,0,0],[1,3,0,0,0,0,0],[1,3,0,0,0,0,0],[1,4,0,0,0,0,0],[1,4,0,0,0,0,0],[1,4,0,0,0,0,0],[1,1,1,0,0,0,0],[1,1,1,0,0,0,0],[1,2,1,0,2,2,0],[1,3,1,0,3,1,0],[1,4,1,0,2,2,0],[1,4,1,0,2,2,0],[1,4,1,0,2,2,0],[1,1,2,0,0,0,0],[1,1,2,0,0,0,0],[1,1,2,0,0,0,0],[1,1,2,0,0,0,0],[1,2,2,0,1,3,0],[1,3,2,0,1,3,0],[1,4,2,0,3,1,0],[1,4,2,0,3,1,0],[1,4,2,0,3,1,0],[1,1,3,0,0,0,0],[1,1,3,0,0,0,0],[1,1,3,0,0,0,0],[1,1,3,0,0,0,0],[1,2,3,0,3,1,0],[1,3,3,0,2,2,0],[1,4,3,0,1,3,0],[1,4,3,0,1,3,0],[1,4,3,0,1,3,0]]} # fmt: skip - - new_encoded_inputs = tokenizer.encode_plus(table=table, query=queries[0]) - - self.assertDictEqual(dict(new_encoded_inputs), expected_results) - - @slow - def test_full_tokenizer(self): - data = [ - ["Pos", "No", "Driver", "Team", "Laps", "Time/Retired", "Grid", "Points"], - ["1", "32", "Patrick Carpentier", "Team Player's", "87", "1:48:11.023", "1", "22"], - ["2", "1", "Bruno Junqueira", "Newman/Haas Racing", "87", "+0.8 secs", "2", "17"], - ["3", "3", "Paul Tracy", "Team Player's", "87", "+28.6 secs", "3", "14"], - ["4", "9", "Michel Jourdain, Jr.", "Team Rahal", "87", "+40.8 secs", "13", "12"], - ["5", "34", "Mario Haberfeld", "Mi-Jack Conquest Racing", "87", "+42.1 secs", "6", "10"], - ["6", "20", "Oriol Servia", "Patrick Racing", "87", "+1:00.2", "10", "8"], - ["7", "51", "Adrian Fernandez", "Fernandez Racing", "87", "+1:01.4", "5", "6"], - ["8", "12", "Jimmy Vasser", "American Spirit Team Johansson", "87", "+1:01.8", "8", "5"], - ["9", "7", "Tiago Monteiro", "Fittipaldi-Dingman Racing", "86", "+ 1 Lap", "15", "4"], - ["10", "55", "Mario Dominguez", "Herdez Competition", "86", "+ 1 Lap", 
"11", "3"], - ["11", "27", "Bryan Herta", "PK Racing", "86", "+ 1 Lap", "12", "2"], - ["12", "31", "Ryan Hunter-Reay", "American Spirit Team Johansson", "86", "+ 1 Lap", "17", "1"], - ["13", "19", "Joel Camathias", "Dale Coyne Racing", "85", "+ 2 Laps", "18", "0"], - ["14", "33", "Alex Tagliani", "Rocketsports Racing", "85", "+ 2 Laps", "14", "0"], - ["15", "4", "Roberto Moreno", "Herdez Competition", "85", "+ 2 Laps", "9", "0"], - ["16", "11", "Geoff Boss", "Dale Coyne Racing", "83", "Mechanical", "19", "0"], - ["17", "2", "Sebastien Bourdais", "Newman/Haas Racing", "77", "Mechanical", "4", "0"], - ["18", "15", "Darren Manning", "Walker Racing", "12", "Mechanical", "7", "0"], - ["19", "5", "Rodolfo Lavin", "Walker Racing", "10", "Mechanical", "16", "0"], - ] - query = "what were the drivers names?" - table = pd.DataFrame.from_records(data[1:], columns=data[0]) - - tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq", model_max_length=512, from_pt=True) - model_inputs = tokenizer(table, query, padding="max_length") - - input_ids = model_inputs["input_ids"] - token_type_ids = np.array(model_inputs["token_type_ids"]) - segment_ids = token_type_ids[:, 0] - column_ids = token_type_ids[:, 1] - row_ids = token_type_ids[:, 2] - - expected_results = {'input_ids':[101,2054,2020,1996,6853,3415,1029,102,13433,2015,2053,4062,2136,10876,2051,1013,3394,8370,2685,1015,3590,4754,29267,4765,3771,2136,2447,1005,1055,6584,1015,1024,4466,1024,2340,1012,6185,2509,1015,2570,1016,1015,10391,12022,4226,7895,10625,1013,22996,3868,6584,1009,1014,1012,1022,10819,2015,1016,2459,1017,1017,2703,10555,2136,2447,1005,1055,6584,1009,2654,1012,1020,10819,2015,1017,2403,1018,1023,8709,8183,3126,21351,2078,1010,3781,1012,2136,10958,8865,6584,1009,2871,1012,1022,10819,2015,2410,2260,1019,4090,7986,5292,5677,8151,2771,1011,2990,9187,3868,6584,1009,4413,1012,1015,10819,2015,1020,2184,1020,2322,2030,20282,14262,9035,4754,3868,6584,1009,1015,1024,4002,1012,1016,2184,1022,1021,4868,7918,12023,12023,3868,6584,1009,1015,1024,5890,1012,1018,1019,1020,1022,2260,5261,12436,18116,2137,4382,2136,26447,6584,1009,1015,1024,5890,1012,1022,1022,1019,1023,1021,27339,3995,10125,9711,4906,25101,24657,1011,22033,2386,3868,6564,1009,1015,5001,2321,1018,2184,4583,7986,14383,2075,29488,14906,9351,2971,6564,1009,1015,5001,2340,1017,2340,2676,8527,2014,2696,1052,2243,3868,6564,1009,1015,5001,2260,1016,2260,2861,4575,4477,1011,2128,4710,2137,4382,2136,26447,6564,1009,1015,5001,2459,1015,2410,2539,8963,11503,25457,3022,8512,2522,9654,3868,5594,1009,1016,10876,2324,1014,2403,3943,4074,6415,15204,2072,12496,25378,3868,5594,1009,1016,10876,2403,1014,2321,1018,10704,17921,14906,9351,2971,5594,1009,1016,10876,1023,1014,2385,2340,14915,5795,8512,2522,9654,3868,6640,6228,2539,1014,2459,1016,28328,8945,3126,21351,2015,10625,1013,22996,3868,6255,6228,1018,1014,2324,2321,12270,11956,5232,3868,2260,6228,1021,1014,2539,1019,8473,28027,2080,2474,6371,5232,3868,2184,6228,2385,1014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],'column_ids':[0,0,0,0,0,0,0,0,1,1,2,3,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,5,6,6,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,3,3,3,3,4,4,4,5,6,6,6,6,
6,6,7,8,1,2,3,3,3,3,4,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,4,4,4,4,5,6,6,6,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,3,4,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,4,5,6,6,6,7,8,1,2,3,3,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,4,4,4,5,6,6,6,7,8,1,2,3,3,4,4,4,4,5,6,7,8,1,2,3,3,3,3,3,4,4,4,4,5,6,7,8,1,2,3,3,4,4,5,6,7,8,1,2,3,3,3,3,3,4,4,5,6,7,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],'row_ids':[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,19,19,19,19,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],'segment_ids':[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]} # fmt: skip - - self.assertListEqual(input_ids, expected_results["input_ids"]) - self.assertListEqual(segment_ids.tolist(), expected_results["segment_ids"]) - self.assertListEqual(column_ids.tolist(), expected_results["column_ids"]) - self.assertListEqual(row_ids.tolist(), expected_results["row_ids"]) - - @unittest.skip("Doesn't support another framework than PyTorch") - def test_np_encode_plus_sent_to_model(self): - pass - - @unittest.skip("Chat is not supported") - def 
test_chat_template(self): - pass - - @unittest.skip(reason="no pretrained tokenizer for tapas model") - def test_pretrained_model_lists(self): - pass diff --git a/tests/transformers/models/tapex/__init__.py b/tests/transformers/models/tapex/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/tapex/test_tokenization_tapex.py b/tests/transformers/models/tapex/test_tokenization_tapex.py deleted file mode 100644 index 65e36abb8..000000000 --- a/tests/transformers/models/tapex/test_tokenization_tapex.py +++ /dev/null @@ -1,912 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -import shutil -import tempfile -import unittest -from typing import List - -import pandas as pd - -from tokenizers import AddedToken -from mindnlp.transformers import TapexTokenizer -from mindnlp.transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES -from mindnlp.utils.testing_utils import slow - -from ...test_tokenization_common import TokenizerTesterMixin - - -# @require_pandas -class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = TapexTokenizer - test_rust_tokenizer = False - from_pretrained_kwargs = {"cls_token": ""} - test_seq2seq = False - - def setUp(self): - super().setUp() - - # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt - # fmt: off - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", ""] # noqa: E231 - # fmt: on - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - def get_table(self, tokenizer, length=5): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] - - if length == 0: - data = {} - else: - data = {toks[0]: [toks[tok] for tok in range(1, length)]} - - table = pd.DataFrame.from_dict(data) - - return table - - def get_table_and_query(self, tokenizer, length=5): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] - table = self.get_table(tokenizer, length=length - 3) - query = " ".join(toks[:3]) - - return table, query - - def get_clean_sequence( - self, - tokenizer, - with_prefix_space=False, - max_length=20, - min_length=5, - empty_table: bool = False, - add_special_tokens: bool = True, - return_table_and_query: bool = False, - ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] - - if empty_table: - table = pd.DataFrame.from_dict({}) - query = " ".join(toks[:min_length]) - else: - data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]} - table = pd.DataFrame.from_dict(data) - query = " ".join(toks[:3]) - - output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens) - output_txt = tokenizer.decode(output_ids) - - if len(output_ids) < min_length: - raise ValueError("Update the code to generate the sequences so that they are larger") - if len(output_ids) > max_length: - raise ValueError("Update the code to generate the sequences so that they are smaller") - - if return_table_and_query: - return output_txt, output_ids, table, query - - return output_txt, output_ids - - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - input_text = "lower newer" - output_text = "lower newer" - return input_text, output_text - - def test_full_tokenizer_roberta(self): - - tokenizer = self.tokenizer_class(vocab_file=self.vocab_file,merges_file=self.merges_file, **self.special_tokens_map) - # vocab_file=r'C:\Users\Nxt03\Downloads\vocab.json' - # merges_file=r'C:\Users\Nxt03\Downloads\merges_file' - - text = "lower newer" - bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - def roberta_dict_integration_testing(self): - tokenizer = self.get_tokenizer() - - self.assertListEqual(tokenizer.encode("Hello world!", 
add_special_tokens=False), [0, 31414, 232, 328, 2]) - self.assertListEqual( - tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False), - [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], - ) - - def test_add_tokens_tokenizer(self): - tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - - # We usually have added tokens from the start in tests because our vocab fixtures are - # smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.encode( - table, - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", - add_special_tokens=False, - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.pad_token_id) - - def test_token_type_ids(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - empty_table = self.get_table(tokenizer, length=0) - seq_0 = "Test this method." 
- - # We want to have sequence 0 and sequence 1 are tagged - # respectively with 0 and 1 token_ids - # (regardless of whether the model use token type ids) - # We use this assumption in the QA pipeline among other place - output = tokenizer(empty_table, seq_0, return_token_type_ids=True) - - # Assert that the token type IDs have the same length as the input IDs - self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) - self.assertIn(0, output["token_type_ids"]) - - def test_add_special_tokens(self): - tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - input_table = self.get_table(tokenizer, length=0) - - special_token = "[SPECIAL_TOKEN]" - - tokenizer.add_special_tokens({"cls_token": special_token}) - encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False) - self.assertEqual(len(encoded_special_token), 1) - - decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) - self.assertTrue(special_token not in decoded) - - def test_batch_encode_plus_overflowing_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - table = self.get_table(tokenizer, length=10) - string_sequences = ["Testing the prepare_for_model method.", "Test"] - - if tokenizer.pad_token is None: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - tokenizer.batch_encode_plus( - table, string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3 - ) - - # @is_pt_tf_cross_test - def test_batch_encode_plus_tensors(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - table = self.get_table(tokenizer, length=0) - - # A Tensor cannot be build by sequences which are not the same size - self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="ms") - self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="ms") - - if tokenizer.pad_token_id is None: - self.assertRaises( - ValueError, - tokenizer.batch_encode_plus, - table, - sequences, - padding=True, - return_tensors="ms", - ) - self.assertRaises( - ValueError, - tokenizer.batch_encode_plus, - table, - sequences, - padding="longest", - return_tensors="ms", - ) - else: - pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="ms") - tensorflow_tensor = tokenizer.batch_encode_plus( - table, sequences, padding="longest", return_tensors="ms" - ) - encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True) - - for key in encoded_sequences.keys(): - pytorch_value = pytorch_tensor[key].tolist() - tensorflow_value = tensorflow_tensor[key].numpy().tolist() - encoded_value = encoded_sequences[key] - - self.assertEqual(pytorch_value, tensorflow_value, encoded_value) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - 
"Testing batch encode plus with different sequence lengths correctly pads", - ] - - # Test not batched - table = self.get_table(tokenizer, length=0) - encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0]) - encoded_sequences_2 = tokenizer(table, sequences[0]) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test not batched pairs - table = self.get_table(tokenizer, length=10) - encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1]) - encoded_sequences_2 = tokenizer(table, sequences[1]) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test batched - table = self.get_table(tokenizer, length=0) - encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences) - encoded_sequences_2 = tokenizer(table, sequences) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - def test_internal_consistency(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - input_text, output_text = self.get_input_output_texts(tokenizer) - - tokens = tokenizer.tokenize(input_text) - ids = tokenizer.convert_tokens_to_ids(tokens) - ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False) - self.assertListEqual(ids, ids_2) - - tokens_2 = tokenizer.convert_ids_to_tokens(ids) - self.assertNotEqual(len(tokens_2), 0) - text_2 = tokenizer.decode(ids) - self.assertIsInstance(text_2, str) - - self.assertEqual(text_2, output_text) - - def test_save_and_load_tokenizer(self): - # safety check on max_len default value so we are sure the test works - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertNotEqual(tokenizer.model_max_length, 42) - - # Now let's start the test - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - table = self.get_table(tokenizer, length=0) - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00E9d,running" - before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False) - after_vocab = after_tokenizer.get_vocab() - self.assertListEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - - shutil.rmtree(tmpdirname) - - def test_number_of_added_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) - - sequences = tokenizer.encode(table, query, add_special_tokens=False) - attached_sequences = tokenizer.encode(table, query, add_special_tokens=True) - - self.assertEqual(2, len(attached_sequences) - len(sequences)) - - @unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`") - def test_prepare_for_model(self): - pass - - @unittest.skip("TAPEX tokenizer does not support pairs.") - def test_maximum_encoding_length_pair_input(self): - pass - - @unittest.skip("TAPEX tokenizer does not support pairs.") - def test_maximum_encoding_length_single_input(self): - pass - - 
@unittest.skip("Not implemented") - def test_right_and_left_truncation(self): - pass - - def test_encode_decode_with_spaces(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - - new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] - tokenizer.add_tokens(new_toks) - input = "[ABC][DEF][ABC][DEF]" - if self.space_between_special_tokens: - output = "[ABC] [DEF] [ABC] [DEF]" - else: - output = input - encoded = tokenizer.encode(table, input, add_special_tokens=False) - decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) - self.assertIn(decoded, [output, output.lower()]) - - def test_tokenize_special_tokens(self): - """Test `tokenize` with special tokens.""" - tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]" - SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]" - - # import pdb - # pdb.set_trace() - # TODO: - # Can we combine `unique_no_split_tokens` and `all_special_tokens`(and properties related to it) - # with one variable(property) for a better maintainability? - - # `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py) - - #this is wrong method to add SPECIAL - #tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True) - - # `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`, - # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py) - tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]}) - - # token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) - token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2) - - # self.assertEqual(len(token_1), 1) - self.assertEqual(len(token_2), 1) - # self.assertEqual(token_1[0], SPECIAL_TOKEN_1) - self.assertEqual(token_2[0], SPECIAL_TOKEN_2) - - def test_special_tokens_mask(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequence_0 = "Encode this." 
- # Testing single inputs - encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer) - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - table, - sequence, - max_length=sequence_length + padding_size, - pad_to_max_length=True, - ) - padded_sequence_length = len(padded_sequence) - self.assertEqual(sequence_length + padding_size, padded_sequence_length) - self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - self.assertEqual(sequence_length, padded_sequence_right_length) - self.assertListEqual(encoded_sequence, padded_sequence_right) - - def test_padding_to_multiple_of(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - if tokenizer.pad_token is None: - self.skipTest("No padding token.") - else: - empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8) - normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8) - for key, value in empty_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - # Should also work with truncation - normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - def test_right_and_left_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with 
self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - table, sequence, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - self.assertEqual(sequence_length + padding_size, padded_sequence_length) - self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) - - # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "left" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - table, sequence, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - self.assertEqual(sequence_length + padding_size, padded_sequence_length) - self.assertListEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence) - - # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence, padding=True) - padded_sequence_right_length = len(padded_sequence_right) - self.assertEqual(sequence_length, padded_sequence_right_length) - self.assertListEqual(encoded_sequence, padded_sequence_right) - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(table, sequence, padding="longest") - padded_sequence_left_length = len(padded_sequence_left) - self.assertEqual(sequence_length, padded_sequence_left_length) - self.assertListEqual(encoded_sequence, padded_sequence_left) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence) - padded_sequence_right_length = len(padded_sequence_right) - self.assertEqual(sequence_length, padded_sequence_right_length) - self.assertListEqual(encoded_sequence, padded_sequence_right) - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(table, sequence, padding=False) - padded_sequence_left_length = len(padded_sequence_left) - self.assertEqual(sequence_length, padded_sequence_left_length) - self.assertListEqual(encoded_sequence, padded_sequence_left) - - def test_encode_plus_with_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequence = "Sequence" - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_size = 10 - padding_idx = tokenizer.pad_token_id - token_type_padding_idx = tokenizer.pad_token_type_id - - encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True) - input_ids = encoded_sequence["input_ids"] - special_tokens_mask = 
encoded_sequence["special_tokens_mask"] - sequence_length = len(input_ids) - - # Test 'longest' and 'no_padding' don't do anything - tokenizer.padding_side = "right" - - not_padded_sequence = tokenizer.encode_plus( - table, - sequence, - padding=False, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - self.assertEqual(sequence_length, not_padded_sequence_length) - self.assertListEqual(input_ids, not_padded_input_ids) - self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask) - - not_padded_sequence = tokenizer.encode_plus( - table, - sequence, - padding=False, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - self.assertEqual(sequence_length, not_padded_sequence_length) - self.assertListEqual(input_ids, not_padded_input_ids) - self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask) - - # Test right padding - tokenizer.padding_side = "right" - - right_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - right_padded_input_ids = right_padded_sequence["input_ids"] - - right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] - right_padded_sequence_length = len(right_padded_input_ids) - - self.assertEqual(sequence_length + padding_size, right_padded_sequence_length) - self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids) - self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask) - - # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - left_padded_input_ids = left_padded_sequence["input_ids"] - left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] - left_padded_sequence_length = len(left_padded_input_ids) - - self.assertEqual(sequence_length + padding_size, left_padded_sequence_length) - self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids) - self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask) - - if "token_type_ids" in tokenizer.model_input_names: - token_type_ids = encoded_sequence["token_type_ids"] - left_padded_token_type_ids = left_padded_sequence["token_type_ids"] - right_padded_token_type_ids = right_padded_sequence["token_type_ids"] - - self.assertListEqual( - (token_type_ids + [[token_type_padding_idx] * 7] * padding_size, right_padded_token_type_ids) - ) - self.assertListEqual( - [[token_type_padding_idx] * 7] * padding_size + token_type_ids, left_padded_token_type_ids - ) - - if "attention_mask" in tokenizer.model_input_names: - attention_mask = encoded_sequence["attention_mask"] - right_padded_attention_mask = right_padded_sequence["attention_mask"] - left_padded_attention_mask = left_padded_sequence["attention_mask"] - - self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask) - self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask) 
- - def test_batch_encode_plus_padding(self): - # Test that padded sequences are equivalent between batch_encode_plus and encode_plus - - # Right padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences = [ - tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length") - for sequence in sequences - ] - encoded_sequences_batch = tokenizer.batch_encode_plus( - table, sequences, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - # Left padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokenizer.padding_side = "left" - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences = [ - tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length") - for sequence in sequences - ] - encoded_sequences_batch = tokenizer.batch_encode_plus( - table, sequences, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - def test_batch_encode_plus_batch_sequence_length(self): - # Tests that all encoded values have the correct size - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer, length=0) - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences] - encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - maximum_length = len( - max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) - ) - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences_padded = [ - tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length") - for sequence in sequences - ] - - encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True) - self.assertListEqual( - encoded_sequences_padded, - self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), - ) - - # check 'longest' is unsensitive to a max length - 
encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True) - encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( - table, sequences, max_length=maximum_length + 10, padding="longest" - ) - for key in encoded_sequences_batch_padded_1.keys(): - self.assertListEqual( - encoded_sequences_batch_padded_1[key], - encoded_sequences_batch_padded_2[key], - ) - - # check 'no_padding' is unsensitive to a max length - encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False) - encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus( - table, sequences, max_length=maximum_length + 10, padding=False - ) - for key in encoded_sequences_batch_padded_1.keys(): - self.assertListEqual( - encoded_sequences_batch_padded_1[key], - encoded_sequences_batch_padded_2[key], - ) - - def test_special_tokens_mask_input_pairs(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence_0 = "Encode this." - empty_table = self.get_table(tokenizer, length=0) - table = self.get_table(tokenizer, length=10) - encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False) - number_of_tokens = len(encoded_sequence) - encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - table, - sequence_0, - add_special_tokens=True, - return_special_tokens_mask=True, - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - # NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it - # in order to have equivalent results with encoding an empty table or empty sequence - del filtered_sequence[number_of_tokens + 1] - filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) - - # @slow - def test_full_tokenizer(self): - question = "Greece held its last Summer Olympics in 2004" - table_dict = { - "header": ["Year", "City", "Country", "Nations"], - "rows": [ - [1896, "Athens", "Greece", 14], - [1900, "Paris", "France", 24], - [1904, "St. 
Louis", "USA", 12], - [2004, "Athens", "Greece", 201], - [2008, "Beijing", "China", 204], - [2012, "London", "UK", 204], - ], - } - table = pd.DataFrame.from_dict(table_dict["rows"]) - table.columns = table_dict["header"] - - tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq") - encoding = tokenizer(table, question) - - # fmt: off - expected_results = {'input_ids': [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]} - # fmt: on - - self.assertListEqual(encoding.input_ids, expected_results["input_ids"]) - - def test_tokenizer_as_target(self): - # by default the tokenizer do_lower_case - tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base") - answer_text = "tapex is a good model!" - expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2] - answer_encoding = tokenizer(answer=answer_text) - self.assertListEqual(answer_encoding.input_ids, expected_src_tokens) - - # @slow - def test_tokenizer_lower_case(self): - cased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=False) - uncased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=True) - answer_text = "Beijing, London, Paris" - answer_text_lower = "beijing, london, paris" - - self.assertNotEqual( - cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids - ) - self.assertEqual( - cased_tokenizer(answer=answer_text_lower).input_ids, - uncased_tokenizer(answer=answer_text).input_ids, - ) - # batched encoding assert - self.assertNotEqual( - cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids - ) - self.assertEqual( - cased_tokenizer(answer=[answer_text_lower]).input_ids, - uncased_tokenizer(answer=[answer_text]).input_ids, - ) - # test input encoding lowercase - question = "Greece held its last Summer Olympics in 2004" - table_dict = { - "header": ["Year", "City", "Country", "Nations"], - "rows": [ - [1896, "Athens", "Greece", 14], - [1900, "Paris", "France", 24], - [1904, "St. 
Louis", "USA", 12], - [2004, "Athens", "Greece", 201], - [2008, "Beijing", "China", 204], - [2012, "London", "UK", 204], - ], - } - table = pd.DataFrame.from_dict(table_dict["rows"]) - table.columns = table_dict["header"] - - self.assertNotEqual( - cased_tokenizer(table=table, query=question).input_ids, - uncased_tokenizer(table=table, query=question).input_ids, - ) diff --git a/tests/transformers/models/time_series_transformer/__init__.py b/tests/transformers/models/time_series_transformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/transformers/models/time_series_transformer/test_modeling_time_series_transformer.py deleted file mode 100644 index ae8293d5c..000000000 --- a/tests/transformers/models/time_series_transformer/test_modeling_time_series_transformer.py +++ /dev/null @@ -1,551 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch TimeSeriesTransformer model.""" - -import inspect -import tempfile -import unittest - -from huggingface_hub import hf_hub_download -from parameterized import parameterized - -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -TOLERANCE = 1e-4 - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - from mindnlp.core.serialization import load - - from mindnlp.transformers import ( - TimeSeriesTransformerConfig, - TimeSeriesTransformerForPrediction, - TimeSeriesTransformerModel, - ) - from mindnlp.transformers.models.time_series_transformer.modeling_time_series_transformer import ( - TimeSeriesTransformerDecoder, - TimeSeriesTransformerEncoder, - ) - - -@require_mindspore -class TimeSeriesTransformerModelTester: - def __init__( - self, - parent, - batch_size=13, - prediction_length=7, - context_length=14, - cardinality=19, - embedding_dimension=5, - num_time_features=4, - is_training=True, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - lags_sequence=[1, 2, 3, 4, 5], - ): - self.parent = parent - self.batch_size = batch_size - self.prediction_length = prediction_length - self.context_length = context_length - self.cardinality = cardinality - self.num_time_features = num_time_features - self.lags_sequence = lags_sequence - self.embedding_dimension = embedding_dimension - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act 
= hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - - self.encoder_seq_length = context_length - self.decoder_seq_length = prediction_length - - def get_config(self): - return TimeSeriesTransformerConfig( - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - encoder_ffn_dim=self.intermediate_size, - decoder_ffn_dim=self.intermediate_size, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - prediction_length=self.prediction_length, - context_length=self.context_length, - lags_sequence=self.lags_sequence, - num_time_features=self.num_time_features, - num_static_real_features=1, - num_static_categorical_features=1, - cardinality=[self.cardinality], - embedding_dimension=[self.embedding_dimension], - scaling="std", # we need std to get non-zero `loc` - ) - - def prepare_time_series_transformer_inputs_dict(self, config): - _past_length = config.context_length + max(config.lags_sequence) - - static_categorical_features = ids_tensor([self.batch_size, 1], config.cardinality[0]) - static_real_features = floats_tensor([self.batch_size, 1]) - - past_time_features = floats_tensor([self.batch_size, _past_length, config.num_time_features]) - past_values = floats_tensor([self.batch_size, _past_length]) - past_observed_mask = floats_tensor([self.batch_size, _past_length]) > 0.5 - - # decoder inputs - future_time_features = floats_tensor([self.batch_size, config.prediction_length, config.num_time_features]) - future_values = floats_tensor([self.batch_size, config.prediction_length]) - - inputs_dict = { - "past_values": past_values, - "static_categorical_features": static_categorical_features, - "static_real_features": static_real_features, - "past_time_features": past_time_features, - "past_observed_mask": past_observed_mask, - "future_time_features": future_time_features, - "future_values": future_values, - } - return inputs_dict - - def prepare_config_and_inputs(self): - config = self.get_config() - inputs_dict = self.prepare_time_series_transformer_inputs_dict(config) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = TimeSeriesTransformerModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = TimeSeriesTransformerEncoder.from_pretrained(tmpdirname) - - transformer_inputs, _, _, _ = model.create_network_inputs(**inputs_dict) - enc_input = transformer_inputs[:, : config.context_length, ...] - dec_input = transformer_inputs[:, config.context_length :, ...] 
- - encoder_last_hidden_state_2 = encoder(inputs_embeds=enc_input)[0] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = TimeSeriesTransformerDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - inputs_embeds=dec_input, - encoder_hidden_states=encoder_last_hidden_state, - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class TimeSeriesTransformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (TimeSeriesTransformerModel, TimeSeriesTransformerForPrediction) if is_mindspore_available() else () - ) - all_generative_model_classes = (TimeSeriesTransformerForPrediction,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": TimeSeriesTransformerModel} if is_mindspore_available() else {} - is_encoder_decoder = True - test_pruning = False - test_head_masking = False - test_missing_keys = False - test_torchscript = False - test_inputs_embeds = False - - def setUp(self): - self.model_tester = TimeSeriesTransformerModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=TimeSeriesTransformerConfig, - has_text_modality=False, - prediction_length=self.model_tester.prediction_length, - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, _ = self.model_tester.prepare_config_and_inputs() - for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - @unittest.skip(reason="Model has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - # # Input is 'static_categorical_features' not 'input_ids' - def test_model_main_input_name(self): - model_signature = inspect.signature(getattr(TimeSeriesTransformerModel, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(TimeSeriesTransformerModel.main_input_name, observed_main_input_name) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "past_values", - "past_time_features", - "past_observed_mask", - "static_categorical_features", - "static_real_features", - "future_values", - "future_time_features", - ] - - expected_arg_names.extend( - [ - "future_observed_mask", - "decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", - "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", - ] - if "future_observed_mask" in arg_names - else [ - 
"decoder_attention_mask", - "head_mask", - "decoder_head_mask", - "cross_attn_head_mask", - "encoder_outputs", - "past_key_values", - "output_hidden_states", - "output_attentions", - "use_cache", - "return_dict", - ] - ) - - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], - ) - out_len = len(outputs) - - correct_outlen = 7 - - if "last_hidden_state" in outputs: - correct_outlen += 1 - - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - if "loss" in outputs: - correct_outlen += 1 - - if "params" in outputs: - correct_outlen += 1 - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_seq_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - encoder_seq_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 2, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length], - ) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @parameterized.expand( - [ - (1, 5, [1]), - (1, 5, [1, 10, 15]), - (1, 5, [3, 6, 9, 10]), - (2, 5, [1, 2, 7]), - (2, 5, [2, 3, 4, 6]), - (4, 5, [1, 5, 9, 11]), - (4, 5, [7, 8, 13, 14]), - ], - ) - def test_create_network_inputs(self, prediction_length, context_length, lags_sequence): - history_length = max(lags_sequence) + context_length - - config = TimeSeriesTransformerConfig( - prediction_length=prediction_length, - context_length=context_length, - lags_sequence=lags_sequence, - scaling=False, - num_parallel_samples=10, - num_static_categorical_features=1, - cardinality=[1], - embedding_dimension=[2], - num_static_real_features=1, - ) - model = TimeSeriesTransformerModel(config) - - batch = { - "static_categorical_features": mindspore.tensor([[0]], dtype=mindspore.int64), - "static_real_features": mindspore.tensor([[0.0]], dtype=mindspore.float32), - "past_time_features": ops.arange(history_length, dtype=mindspore.float32).view(1, history_length, 1), - "past_values": ops.arange(history_length, dtype=mindspore.float32).view(1, history_length), - "past_observed_mask": ops.arange(history_length, dtype=mindspore.float32).view(1, history_length), - } - - # test with no future_target (only one step prediction) - batch["future_time_features"] = ops.arange(history_length, history_length + 1, dtype=mindspore.float32).view( - 1, 1, 1 - ) - transformer_inputs, loc, scale, _ = model.create_network_inputs(**batch) - - self.assertTrue((scale == 1.0).all()) - assert (loc == 0.0).all() - - ref = ops.arange(max(lags_sequence), history_length, dtype=mindspore.float32) - - for idx, lag in enumerate(lags_sequence): - assert ops.isclose(ref - lag, transformer_inputs[0, :, idx]).all() - - # test with all future data - batch["future_time_features"] = ops.arange( - history_length, history_length + prediction_length, dtype=mindspore.float32 - ).view(1, prediction_length, 1) - batch["future_values"] = ops.arange( - history_length, history_length + prediction_length, dtype=mindspore.float32 - ).view(1, prediction_length) - transformer_inputs, loc, scale, _ = model.create_network_inputs(**batch) - - assert (scale == 1.0).all() - assert (loc == 0.0).all() - - ref = ops.arange(max(lags_sequence), history_length + prediction_length, dtype=mindspore.float32) - - for idx, lag in enumerate(lags_sequence): - assert ops.isclose(ref - lag, transformer_inputs[0, :, idx]).all() - - # test for generation - batch.pop("future_values") - transformer_inputs, loc, scale, _ = model.create_network_inputs(**batch) - - lagged_sequence = model.get_lagged_subsequences( - sequence=batch["past_values"], - subsequences_length=1, - shift=1, - ) - # assert that the last element of the lagged sequence is the one after the encoders input - assert transformer_inputs[0, ..., 0][-1] + 1 == lagged_sequence[0, ..., 0][-1] - - future_values = ops.arange(history_length, history_length + prediction_length, dtype=mindspore.float32).view( - 1, 
prediction_length - ) - # assert that the first element of the future_values is offset by lag after the decoders input - assert lagged_sequence[0, ..., 0][-1] + lags_sequence[0] == future_values[0, ..., 0] - - @unittest.skip(reason="Model does not have input embeddings") - def test_model_get_set_embeddings(self): - pass - - -def prepare_batch(filename="train-batch.pt"): - file = hf_hub_download(repo_id="hf-internal-testing/tourism-monthly-batch", filename=filename, repo_type="dataset") - batch = load(file) - return batch - - -@require_mindspore -@slow -class TimeSeriesTransformerModelIntegrationTests(unittest.TestCase): - def test_inference_no_head(self): - model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly") - batch = prepare_batch() - - with no_grad(): - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - static_real_features=batch["static_real_features"], - future_values=batch["future_values"], - future_time_features=batch["future_time_features"], - ).last_hidden_state - - expected_shape = (64, model.config.context_length, model.config.d_model) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.8196, -1.5131, 1.4620], [1.1268, -1.3238, 1.5997], [1.5098, -1.0715, 1.7359]] - ) - self.assertTrue(ops.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_head(self): - model = TimeSeriesTransformerForPrediction.from_pretrained( - "huggingface/time-series-transformer-tourism-monthly" - ) - batch = prepare_batch("val-batch.pt") - with no_grad(): - output = model( - past_values=batch["past_values"], - past_time_features=batch["past_time_features"], - past_observed_mask=batch["past_observed_mask"], - static_categorical_features=batch["static_categorical_features"], - static_real_features=batch["static_real_features"], - future_time_features=batch["future_time_features"], - ).encoder_last_hidden_state - expected_shape = (64, model.config.context_length, model.config.d_model) - self.assertEqual(output.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-1.2957, -1.0280, -0.6045], [-0.7017, -0.8193, -0.3717], [-1.0449, -0.8149, 0.1405]] - ) - self.assertTrue(ops.allclose(output[0, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_seq_to_seq_generation(self): - model = TimeSeriesTransformerForPrediction.from_pretrained( - "huggingface/time-series-transformer-tourism-monthly" - ) - batch = prepare_batch("val-batch.pt") - with no_grad(): - outputs = model.generate( - static_categorical_features=batch["static_categorical_features"], - static_real_features=batch["static_real_features"], - past_time_features=batch["past_time_features"], - past_values=batch["past_values"], - future_time_features=batch["future_time_features"], - past_observed_mask=batch["past_observed_mask"], - ) - expected_shape = (64, model.config.num_parallel_samples, model.config.prediction_length) - self.assertEqual(outputs.sequences.shape, expected_shape) - - expected_slice = mindspore.tensor([2825.2749, 3584.9207, 6763.9951]) - mean_prediction = outputs.sequences.mean(dim=1) - self.assertTrue(ops.allclose(mean_prediction[0, -3:], expected_slice, rtol=1e-1)) \ No newline at end of file diff --git a/tests/transformers/models/timesformer/__init__.py b/tests/transformers/models/timesformer/__init__.py deleted file mode 100644 
index e69de29bb..000000000 diff --git a/tests/transformers/models/timesformer/test_modeling_timesformer.py b/tests/transformers/models/timesformer/test_modeling_timesformer.py deleted file mode 100644 index 0f31d66a8..000000000 --- a/tests/transformers/models/timesformer/test_modeling_timesformer.py +++ /dev/null @@ -1,346 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch TimeSformer model.""" - -import copy -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download - -from mindnlp.transformers import TimesformerConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops, no_grad - - from mindnlp.transformers import ( - MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, - TimesformerForVideoClassification, - TimesformerModel, - ) - - -if is_vision_available(): - from mindnlp.transformers import VideoMAEImageProcessor - - -class TimesformerModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=10, - num_channels=3, - patch_size=2, - num_frames=2, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - num_labels=10, - initializer_range=0.02, - attention_type="divided_space_time", - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.patch_size = patch_size - self.num_frames = num_frames - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.attention_type = attention_type - self.initializer_range = initializer_range - self.scope = scope - self.num_labels = num_labels - - # in TimeSformer, the number of spatial tokens equals num_frames * num_patches per frame + 1 CLS token - self.num_patches_per_frame = (image_size // patch_size) ** 2 - self.seq_length = (num_frames) * self.num_patches_per_frame + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size] - ) - - labels = None - if self.use_labels: - labels = 
ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - config = TimesformerConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - num_frames=self.num_frames, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - attention_type=self.attention_type, - ) - config.num_labels = self.num_labels - return config - - def create_and_check_model(self, config, pixel_values, labels): - model = TimesformerModel(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_video_classification(self, config, pixel_values, labels): - model = TimesformerForVideoClassification(config) - model.eval() - - result = model(pixel_values) - - # verify the logits shape - expected_shape = (self.batch_size, self.num_labels) - self.parent.assertEqual(result.logits.shape, expected_shape) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class TimesformerModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as TimeSformer does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (TimesformerModel, TimesformerForVideoClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": TimesformerModel, "video-classification": TimesformerForVideoClassification} - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = TimesformerModelTester(self) - self.config_tester = ConfigTester( - self, config_class=TimesformerConfig, has_text_modality=False, hidden_size=37 - ) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if return_labels: - if model_class in get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING): - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="TimeSformer does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_video_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_video_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/timesformer-base-finetuned-k400" - model = TimesformerModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_attention_outputs(self): - if not self.has_attentions: - self.skipTest(reason="Model has no attentions") - - else: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - seq_len = self.model_tester.seq_length - num_frames = self.model_tester.num_frames - - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # attentions has shape (batch_size x num_frames) x num_heads x (num_patches per frame + 1) x (num_patches per frame + 1) - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len // num_frames + 1, seq_len // num_frames + 1], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = 
model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - - # attentions has shape (batch_size x num_frames) x num_heads x (num_patches per frame + 1) x (num_patches per frame + 1) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len // num_frames + 1, seq_len // num_frames + 1], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - expected_num_layers = self.model_tester.num_hidden_layers + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - -# We will verify our results on a video of eating spaghetti -# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227] -def prepare_video(): - file = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset" - ) - video = np.load(file) - return list(video) - - -@require_mindspore -@require_vision -class TimesformerModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - # logits were tested with a different mean and std, so we use the same here - return ( - VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5]) - if is_vision_available() - else None - ) - - @slow - def test_inference_for_video_classification(self): - model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400") - - image_processor = self.default_image_processor - video = prepare_video() - inputs = image_processor(video[:8], return_tensors="ms") - - # forward pass - with no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 400) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-0.3016, -0.7713, -0.4205]) - - self.assertTrue(ops.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/tinybert/__init__.py b/tests/transformers/models/tinybert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/trocr/__init__.py b/tests/transformers/models/trocr/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/trocr/test_modeling_trocr.py b/tests/transformers/models/trocr/test_modeling_trocr.py deleted file mode 100644 index 9484906a8..000000000 --- a/tests/transformers/models/trocr/test_modeling_trocr.py +++ /dev/null @@ -1,197 
+0,0 @@ -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch TrOCR model.""" - -import unittest - -import numpy as np -from mindnlp.transformers import TrOCRConfig -from mindnlp.utils.testing_utils import is_mindspore_available, require_mindspore - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor - -if is_mindspore_available(): - from mindspore import ops - - from mindnlp.transformers.models.trocr.modeling_trocr import TrOCRDecoder, TrOCRForCausalLM - - -@require_mindspore -class TrOCRStandaloneDecoderModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - d_model=16, - decoder_seq_length=7, - is_training=True, - is_decoder=True, - use_attention_mask=True, - use_cache=False, - use_labels=True, - decoder_start_token_id=2, - decoder_ffn_dim=32, - decoder_layers=2, - decoder_attention_heads=4, - max_position_embeddings=30, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - - self.vocab_size = vocab_size - self.d_model = d_model - self.hidden_size = d_model - self.num_hidden_layers = decoder_layers - self.decoder_layers = decoder_layers - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_attention_heads = decoder_attention_heads - self.num_attention_heads = decoder_attention_heads - self.eos_token_id = eos_token_id - self.bos_token_id = bos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - - self.scope = None - self.decoder_key_length = decoder_seq_length - self.base_model_out_len = 2 - self.decoder_attention_idx = 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = TrOCRConfig( - vocab_size=self.vocab_size, - d_model=self.d_model, - decoder_layers=self.decoder_layers, - decoder_ffn_dim=self.decoder_ffn_dim, - decoder_attention_heads=self.decoder_attention_heads, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - use_cache=self.use_cache, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - max_position_embeddings=self.max_position_embeddings, - ) - - return config, input_ids, attention_mask, lm_labels - - 
def create_and_check_decoder_model_past( - self, - config, - input_ids, - attention_mask, - lm_labels, - ): - config.use_cache = True - model = TrOCRDecoder(config=config).set_train(False) - input_ids = input_ids[:2] - - input_ids[input_ids == 0] += 1 - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((2, 1), config.vocab_size - 1) + 1 - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, lm_labels = config_and_inputs - - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class TrOCRStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (TrOCRForCausalLM, ) if is_mindspore_available() else () - all_generative_model_classes = (TrOCRForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = {"text-generation": TrOCRForCausalLM} if is_mindspore_available() else {} - fx_compatible = True - test_pruning = False - - def setUp(self): - self.model_tester = TrOCRStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=TrOCRConfig) - - # not implemented currently - def test_inputs_embeds(self): - pass - - # TrOCR has no base model - def test_save_load_fast_init_from_base(self): - pass - - # TrOCR has no base model - def test_save_load_fast_init_to_base(self): - pass - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past(*config_and_inputs) - - # decoder cannot keep gradients - def test_retain_grad_hidden_states_attentions(self): - return - - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass diff --git a/tests/transformers/models/tvlt/__init__.py b/tests/transformers/models/tvlt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/tvlt/test_modeling_tvlt.py b/tests/transformers/models/tvlt/test_modeling_tvlt.py deleted file mode 100644 index 7ab3de130..000000000 --- a/tests/transformers/models/tvlt/test_modeling_tvlt.py +++ /dev/null @@ -1,604 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindsporeTVLT model. """ - -import copy -import inspect -import unittest - -import numpy as np -import os -os.environ['HF_ENDPOINT']='https://hf-mirror.com' -from huggingface_hub import hf_hub_download - -from mindnlp.transformers import TvltConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_vision_available, is_mindspore_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import TvltForAudioVisualClassification, TvltForPreTraining, TvltModel - - from datasets import load_dataset - from mindnlp.transformers import TvltImageProcessor - from mindnlp.transformers import TvltFeatureExtractor - -class TvltModelTester: - def __init__( - self, - parent, - batch_size=2, - image_size=32, - spectrogram_length=32, - frequency_length=16, - image_patch_size=[2, 2], - audio_patch_size=[2, 2], - num_image_channels=3, - num_audio_channels=1, - num_frames=2, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=128, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-12, - qkv_bias=True, - use_mean_pooling=True, - decoder_num_attention_heads=4, - decoder_hidden_size=32, - decoder_num_hidden_layers=2, - decoder_intermediate_size=128, - image_mask_ratio=0.75, - audio_mask_ratio=0.15, - audio_mask_type="frame-level", - task_matching=True, - task_mae=True, - num_labels=1, - is_training=True, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.spectrogram_length = spectrogram_length - self.frequency_length = frequency_length - self.image_patch_size = image_patch_size - self.audio_patch_size = audio_patch_size - self.num_image_channels = num_image_channels - self.num_audio_channels = num_audio_channels - self.num_frames = num_frames - - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.use_mean_pooling = use_mean_pooling - - self.decoder_num_attention_heads = decoder_num_attention_heads - self.decoder_hidden_size = decoder_hidden_size - self.decoder_num_hidden_layers = decoder_num_hidden_layers - self.decoder_intermediate_size = decoder_intermediate_size - self.image_mask_ratio = image_mask_ratio - self.audio_mask_ratio = audio_mask_ratio - - self.task_matching = 
task_matching - self.task_mae = task_mae - self.num_labels = num_labels - - self.expected_pixel_seq_len = (self.image_size // self.image_patch_size[0]) ** 2 * self.num_frames - self.expected_audio_seq_len = (self.spectrogram_length // self.audio_patch_size[0]) * ( - self.frequency_length // self.audio_patch_size[1] - ) - # we set the expected sequence length (which is used in several tests) - # this is equal to the seq length of number of image/video patches + number of audio patches - self.expected_seq_len = self.expected_pixel_seq_len + self.expected_audio_seq_len + 1 - - self.image_mae_output_dim = image_patch_size[0] ** 2 * num_image_channels - self.audio_mae_output_dim = audio_patch_size[0] * audio_patch_size[1] * num_audio_channels - self.is_training = is_training - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size] - ) - audio_values = floats_tensor( - [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length] - ) - - pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len]) - audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len]) - - config = self.get_config() - - return (config, pixel_values, audio_values, pixel_mask, audio_mask) - - def prepare_config_and_inputs_for_pretraining(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size] - ) - audio_values = floats_tensor( - [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length] - ) - - pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len]) - audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len]) - - pixel_values_mixed = floats_tensor( - [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size] - ) - pixel_mask_mixed = floats_tensor([self.batch_size, self.expected_pixel_seq_len]) - labels = floats_tensor([self.batch_size]) - config = self.get_config() - - return ( - config, - pixel_values, - audio_values, - pixel_mask, - audio_mask, - pixel_values_mixed, - pixel_mask_mixed, - labels, - ) - - def get_config(self): - return TvltConfig( - image_size=self.image_size, - spectrogram_length=self.spectrogram_length, - frequency_length=self.frequency_length, - image_patch_size=self.image_patch_size, - audio_patch_size=self.audio_patch_size, - num_image_channels=self.num_image_channels, - num_audio_channels=self.num_audio_channels, - num_frames=self.num_frames, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - qkv_bias=self.qkv_bias, - use_mean_pooling=self.use_mean_pooling, - decoder_num_attention_heads=self.decoder_num_attention_heads, - decoder_hidden_size=self.decoder_hidden_size, - decoder_num_hidden_layers=self.decoder_num_hidden_layers, - decoder_intermediate_size=self.decoder_intermediate_size, - image_mask_ratio=self.image_mask_ratio, - audio_mask_ratio=self.audio_mask_ratio, - task_matching=self.task_matching, - task_mae=self.task_mae, - num_labels=self.num_labels, - ) - - def create_and_check_model(self, config, pixel_values, 
audio_values, pixel_mask, audio_mask): - model = TvltModel(config=config) - model.eval() - result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask) - result = model(pixel_values, audio_values) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size) - ) - - def create_and_check_for_audiovisual_classification( - self, config, pixel_values, audio_values, pixel_mask, audio_mask - ): - model = TvltForAudioVisualClassification(config=config) - model.eval() - result = model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask) - result = model(pixel_values, audio_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_pretraining( - self, - config, - pixel_values, - audio_values, - pixel_mask, - audio_mask, - pixel_values_mixed, - pixel_mask_mixed, - labels, - ): - model = TvltForPreTraining(config=config) - model.set_train() - result = model( - pixel_values, - audio_values, - pixel_mask, - audio_mask, - pixel_values_mixed=pixel_values_mixed, - pixel_mask_mixed=pixel_mask_mixed, - labels=labels, - ) - self.parent.assertEqual( - result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim) - ) - self.parent.assertEqual( - result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim) - ) - self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_pretraining_inference( - self, - config, - pixel_values, - audio_values, - pixel_mask, - audio_mask, - pixel_values_mixed, - pixel_mask_mixed, - labels, - ): - model = TvltForPreTraining(config=config) - model.eval() - result = model( - pixel_values, - audio_values, - pixel_mask, - audio_mask, - pixel_values_mixed=pixel_values_mixed, - pixel_mask_mixed=pixel_mask_mixed, - labels=labels, - ) - if result.pixel_logits is not None: - self.parent.assertEqual( - result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim) - ) - if result.audio_logits is not None: - self.parent.assertEqual( - result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim) - ) - self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - (config, pixel_values, audio_values, pixel_mask, audio_mask) = config_and_inputs - inputs_dict = { - "pixel_values": pixel_values, - "audio_values": audio_values, - "pixel_mask": pixel_mask, - "audio_mask": audio_mask, - } - return config, inputs_dict - - def prepare_pixel_values(self): - return floats_tensor( - [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size] - ) - - def prepare_audio_values(self): - return floats_tensor( - [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length] - ) - - -@require_mindspore -class TvltModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (TvltModel, TvltForPreTraining, TvltForAudioVisualClassification) if is_mindspore_available() else () - ) - pipeline_model_mapping = {"feature-extraction": TvltModel} if is_mindspore_available() else {} - - fx_compatible = False - test_pruning = False - test_headmasking = False - test_torchscript = False - test_resize_embeddings = False - main_input_name = 
"pixel_values" - - # TvltForAudioVisualClassification and TvltForPreTraining require special treatment - def _prepare_for_class(self, inputs_dict, model_class, return_labels=True): - inputs_dict = copy.deepcopy(inputs_dict) - - if return_labels: - if model_class.__name__ == "TvltForAudioVisualClassification": - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class.__name__ == "TvltForPreTraining": - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.float32 - ) - inputs_dict["pixel_values_mixed"] = ops.zeros( - self.model_tester.batch_size, - self.model_tester.num_frames, - self.model_tester.num_image_channels, - self.model_tester.image_size, - self.model_tester.image_size, - dtype=mindspore.float32 - ) - inputs_dict["pixel_mask_mixed"] = ops.zeros( - self.model_tester.batch_size, self.model_tester.expected_pixel_seq_len, - dtype=mindspore.float32, - ) - - return inputs_dict - - def setUp(self): - self.model_tester = TvltModelTester(self) - self.config_tester = ConfigTester(self, config_class=TvltConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="TVLT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - input_embeddings = model.get_input_embeddings() - self.assertIsInstance(input_embeddings, (tuple)) - for embedding in input_embeddings: - self.assertIsInstance(embedding, (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values", "audio_values"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_audiovisual_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_audiovisual_classification(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - self.model_tester.create_and_check_for_pretraining_inference(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "ZinengTang/tvlt-base" - model = TvltModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_training(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes[1:]: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - model = model_class(config) - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class) - loss = model(**inputs).loss - # loss.backward() - - def test_training_gradient_checkpointing(self): - if not 
self.model_tester.is_training: - return - - for model_class in self.all_model_classes[1:]: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - - model = model_class(config) - # model.gradient_checkpointing_enable() - model.supports_gradient_checkpointing = True - model.set_train() - inputs = self._prepare_for_class(inputs_dict, model_class) - loss = model(**inputs).loss - # loss.backward() - - def test_attention_outputs(self): - if not self.has_attentions: - pass - - else: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes[2:]: - seq_len = self.model_tester.expected_seq_len - - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - expected_num_layers = self.model_tester.num_hidden_layers + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.expected_seq_len - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes[2:]: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_model_get_set_embeddings(self): - pass - - -# We will verify our results on a video of eating spaghetti -# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227] -def prepare_video(num_frames=8): - file = 
hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset" - ) - video = np.load(file)[:num_frames] - return list(video) - - -def prepare_audio(num_samples=1): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - return [x["array"] for x in speech_samples] - - -@require_mindspore -@require_vision -class TvltModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processors(self): - # logits were tested with a different mean and std, so we use the same here - return ( - TvltImageProcessor() if is_vision_available() else None, - TvltFeatureExtractor(), - ) - - def test_inference_for_base_model(self): - model = TvltModel.from_pretrained("ZinengTang/tvlt-base") - - image_processor, audio_feature_extractor = self.default_processors - video = prepare_video() - audio = prepare_audio() - video_inputs = image_processor(video, return_tensors="ms") - audio_inputs = audio_feature_extractor(audio, return_tensors="ms") - inputs = {} - inputs.update(video_inputs) - inputs.update(audio_inputs) - - # forward pass - with mindspore._no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_last_hidden_state_slice = mindspore.tensor([[-0.0186, -0.0691], [0.0242, -0.0398]]) - print(outputs.last_hidden_state[:, :2, :2].numpy()) - self.assertTrue( - np.allclose(outputs.last_hidden_state[:, :2, :2].numpy(), expected_last_hidden_state_slice.numpy(), atol=1e-4) - ) - - def test_inference_for_pretraining(self): - model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base") - - image_processor, audio_feature_extractor = self.default_processors - video = prepare_video() - video_mixed = prepare_video() - audio = prepare_audio() - video_inputs = image_processor(video, return_tensors="ms", mask_pixel=True) - video_mixed_inputs = image_processor(video_mixed, is_mixed=True, return_tensors="ms") - audio_inputs = audio_feature_extractor(audio, return_tensors="ms", mask_audio=True) - labels = mindspore.tensor([[0.0]]) - inputs = {} - inputs.update(video_inputs) - inputs.update(video_mixed_inputs) - inputs.update(audio_inputs) - inputs.update({"labels": labels}) - - # forward pass - with mindspore._no_grad(): - outputs = model(**inputs) - - # verify the logits - expected_pixel_logits_shape = (1, 1568, 768) - expected_audio_logits_shape = (1, 96, 256) - expected_matching_logits_shape = (1, 1) - - if outputs.pixel_logits is not None: - self.assertEqual(outputs.pixel_logits.shape, expected_pixel_logits_shape) - if outputs.audio_logits is not None: - self.assertEqual(outputs.audio_logits.shape, expected_audio_logits_shape) - self.assertTrue(outputs.matching_logits.shape, expected_matching_logits_shape) \ No newline at end of file diff --git a/tests/transformers/models/udop/__init__.py b/tests/transformers/models/udop/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/udop/test_modeling_udop.py b/tests/transformers/models/udop/test_modeling_udop.py deleted file mode 100644 index a2f2c10a1..000000000 --- a/tests/transformers/models/udop/test_modeling_udop.py +++ /dev/null @@ -1,575 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import inspect -import unittest - -from huggingface_hub import hf_hub_download - -from mindnlp.transformers import UdopConfig -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, nn, no_grad - from mindnlp.engine import set_seed - - from mindnlp.transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor - - -if is_vision_available(): - from PIL import Image - - -class UdopModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=9, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=32, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - scope=None, - decoder_layers=None, - range_bbox=1000, - decoder_start_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.scope = None - self.decoder_layers = decoder_layers - self.range_bbox = range_bbox - self.decoder_start_token_id = decoder_start_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - bbox = ids_tensor([self.batch_size, self.encoder_seq_length, 4], self.range_bbox).float() - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = bbox[i, j, 2] - bbox[i, j, 2] = bbox[i, j, 0] - bbox[i, j, 0] = t - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - attention_mask = None - decoder_attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - - 
lm_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - config = self.get_config() - - return ( - config, - input_ids, - bbox, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) - - def get_config(self): - return UdopConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def create_and_check_model( - self, - config, - input_ids, - bbox, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = UdopModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - bbox=bbox, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, bbox=bbox, decoder_input_ids=decoder_input_ids) - decoder_output = result.last_hidden_state - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_with_lm_head( - self, - config, - input_ids, - bbox, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = UdopForConditionalGeneration(config=config).eval() - outputs = model( - input_ids=input_ids, - bbox=bbox, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - labels=lm_labels, - ) - self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.decoder_seq_length, self.vocab_size)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def create_and_check_generate_with_past_key_values( - self, - config, - input_ids, - bbox, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = UdopForConditionalGeneration(config=config).eval() - set_seed(0) - output_without_past_cache = model.generate( - input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True, use_cache=False - ) - set_seed(0) - output_with_past_cache = model.generate( - input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True - ) - self.parent.assertTrue(ops.all(output_with_past_cache == output_without_past_cache)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - bbox, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = UdopForConditionalGeneration(config=config).half().eval() - output = model(input_ids, bbox=bbox, 
attention_mask=attention_mask, decoder_input_ids=decoder_input_ids).logits - self.parent.assertFalse(ops.isnan(output).any().item()) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "bbox": bbox, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "use_cache": False, - } - return config, inputs_dict - - -@require_mindspore -class UdopModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - UdopModel, - UdopForConditionalGeneration, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (UdopForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": UdopModel} if is_mindspore_available() else {} - fx_compatible = False - test_pruning = False - test_torchscript = False - test_head_masking = False - test_resize_embeddings = True - test_model_parallel = False - is_encoder_decoder = True - test_cpu_offload = False - # The small UDOP model needs higher percentages for CPU/MP tests - model_split_percents = [0.8, 0.9] - - def setUp(self): - self.model_tester = UdopModelTester(self) - self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - if model_class.__name__ == "UdopForConditionalGeneration": - if return_labels: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_with_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_lm_head(*config_and_inputs) - - def test_generate_with_past_key_values(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - @unittest.skip(reason="Gradient checkpointing is not supported by this model") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is 
deterministic - arg_names = sorted([*signature.parameters.keys()]) - - expected_arg_names = [ - "attention_mask", - "bbox", - "cross_attn_head_mask", - "decoder_attention_mask", - "decoder_head_mask", - "decoder_input_ids", - "decoder_inputs_embeds", - "encoder_outputs", - "head_mask", - "input_ids", - "inputs_embeds", - ] - if model_class in self.all_generative_model_classes: - expected_arg_names.append( - "labels", - ) - expected_arg_names = sorted(expected_arg_names) - self.assertListEqual(sorted(arg_names[: len(expected_arg_names)]), expected_arg_names) - - @unittest.skip( - "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" - ) - def test_save_load_low_cpu_mem_usage(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/udop-large" - model = UdopForConditionalGeneration.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class UdopEncoderOnlyModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - seq_length=7, - # For common tests - is_training=False, - use_attention_mask=True, - hidden_size=32, - num_hidden_layers=5, - decoder_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=32, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - scope=None, - range_bbox=1000, - ): - self.parent = parent - self.batch_size = batch_size - # For common tests - self.seq_length = seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.decoder_layers = decoder_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.scope = None - self.range_bbox = range_bbox - - def get_config(self): - return UdopConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - is_encoder_decoder=False, - ) - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox).float() - # Ensure that bbox is legal - for i in range(bbox.shape[0]): - for j in range(bbox.shape[1]): - if bbox[i, j, 3] < bbox[i, j, 1]: - t = bbox[i, j, 3] - bbox[i, j, 3] = bbox[i, j, 1] - bbox[i, j, 1] = t - if bbox[i, j, 2] < bbox[i, j, 0]: - t = bbox[i, j, 2] - bbox[i, j, 2] = bbox[i, j, 0] - bbox[i, j, 0] = t - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - config = self.get_config() - - return ( - config, - input_ids, - bbox, - attention_mask, - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - attention_mask, - ) = 
config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "bbox": bbox, - "attention_mask": attention_mask, - } - return config, inputs_dict - - def create_and_check_model( - self, - config, - input_ids, - bbox, - attention_mask, - ): - model = UdopEncoderModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - bbox=bbox, - attention_mask=attention_mask, - ) - encoder_output = result.last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - bbox, - attention_mask, - ): - model = UdopEncoderModel(config=config).half().eval() - output = model(input_ids, bbox=bbox, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - -class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (UdopEncoderModel,) if is_mindspore_available() else () - test_pruning = False - test_torchscript = False - test_head_masking = False - test_resize_embeddings = False - test_model_parallel = False - all_parallelizable_model_classes = (UdopEncoderModel,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = UdopEncoderOnlyModelTester(self) - self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip( - "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" - ) - def test_save_load_low_cpu_mem_usage(self): - pass - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -@require_vision -@slow -class UdopModelIntegrationTests(unittest.TestCase): - @cached_property - def image(self): - filepath = hf_hub_download( - repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset" - ) - image = Image.open(filepath).convert("RGB") - - return image - - @cached_property - def processor(self): - return UdopProcessor.from_pretrained("microsoft/udop-large") - - @cached_property - def model(self): - return UdopForConditionalGeneration.from_pretrained("microsoft/udop-large") - - def test_conditional_generation(self): - processor = self.processor - model = self.model - - prompt = "Question answering. In which year is the report made?" - encoding = processor(images=self.image, text=prompt, return_tensors="ms") - - predicted_ids = model.generate(**encoding) - - predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] - self.assertEqual(predicted_text, "2013") \ No newline at end of file diff --git a/tests/transformers/models/udop/test_processor_udop.py b/tests/transformers/models/udop/test_processor_udop.py deleted file mode 100644 index e161ca893..000000000 --- a/tests/transformers/models/udop/test_processor_udop.py +++ /dev/null @@ -1,510 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import shutil -import tempfile -import unittest -from typing import List - -import numpy as np -from mindnlp.transformers import ( - PreTrainedTokenizer, - PreTrainedTokenizerBase, - PreTrainedTokenizerFast, - UdopTokenizer, - UdopTokenizerFast, -) -from mindnlp.utils.testing_utils import ( - require_pytesseract, - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import cached_property,is_mindspore_available -FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" - -if is_mindspore_available(): - import mindspore - - -from PIL import Image -from mindnlp.transformers import LayoutLMv3ImageProcessor, UdopProcessor - - -@require_pytesseract -@require_sentencepiece -@require_tokenizers -class UdopProcessorTest(unittest.TestCase): - tokenizer_class = UdopTokenizer - rust_tokenizer_class = UdopTokenizerFast - maxDiff = None - - def setUp(self): - image_processor_map = { - "do_resize": True, - "size": 224, - "apply_ocr": True, - } - - self.tmpdirname = tempfile.mkdtemp() - self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) - with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(image_processor_map) + "\n") - - self.tokenizer_pretrained_name = "microsoft/udop-large" - - def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: - return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs) - - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: - return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs) - - def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: - return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] - - def get_image_processor(self, **kwargs): - return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - - def test_save_load_pretrained_default(self): - image_processor = self.get_image_processor() - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - - processor.save_pretrained(self.tmpdirname) - processor = UdopProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, (UdopTokenizer, UdopTokenizerFast)) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = UdopProcessor(image_processor=self.get_image_processor(), tokenizer=self.get_tokenizer()) - processor.save_pretrained(self.tmpdirname) - - # slow tokenizer - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) - - processor = UdopProcessor.from_pretrained( - self.tmpdirname, - use_fast=False, - bos_token="(BOS)", - eos_token="(EOS)", - do_resize=False, - size=30, - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, UdopTokenizer) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor) - - # fast tokenizer - tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_resize=False, size=30) - - processor = UdopProcessor.from_pretrained( - self.tmpdirname, use_xlm=True, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, UdopTokenizerFast) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, LayoutLMv3ImageProcessor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = UdopProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), processor.model_input_names) - - def test_text_target(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = UdopProcessor(tokenizer=tokenizer, image_processor=image_processor) - - text = "hello world" - expected_decoding = "hello world" - - encoding_processor = processor(text_target=text) - encoding_tokenizer = tokenizer(text_target=text) - - self.assertListEqual(encoding_processor["input_ids"], [21820, 296, 1]) - self.assertListEqual(encoding_processor["attention_mask"], [1, 1, 1]) - self.assertDictEqual(dict(encoding_processor), dict(encoding_tokenizer)) - self.assertEqual(tokenizer.decode(encoding_processor["input_ids"]), expected_decoding) - - @slow - def 
test_overflowing_tokens(self): - # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences). - - from datasets import load_dataset - - # set up - datasets = load_dataset("nielsr/funsd", trust_remote_code=True) - processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) - - def preprocess_data(examples): - images = [Image.open(path).convert("RGB") for path in examples["image_path"]] - words = examples["words"] - boxes = examples["bboxes"] - word_labels = examples["ner_tags"] - encoded_inputs = processor( - images, - words, - boxes=boxes, - word_labels=word_labels, - max_length=512, - padding="max_length", - truncation=True, - return_overflowing_tokens=True, - stride=50, - return_offsets_mapping=True, - return_tensors="ms", - ) - return encoded_inputs - - train_data = preprocess_data(datasets["train"]) - - self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"])) - - -# different use cases tests -@require_sentencepiece -@require_mindspore -@require_pytesseract -class UdopProcessorIntegrationTests(unittest.TestCase): - @cached_property - def get_images(self): - # we verify our implementation on 2 document images from the DocVQA dataset - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test") - - image_1 = Image.open(ds[0]["file"]).convert("RGB") - image_2 = Image.open(ds[1]["file"]).convert("RGB") - - return image_1, image_2 - - @cached_property - def get_tokenizers(self): - slow_tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large") - fast_tokenizer = UdopTokenizerFast.from_pretrained("microsoft/udop-large") - return [slow_tokenizer, fast_tokenizer] - - @slow - def test_processor_case_1(self): - # case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True - - image_processor = LayoutLMv3ImageProcessor() - tokenizers = self.get_tokenizers - images = self.get_images - - for tokenizer in tokenizers: - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - - # not batched - input_image_processor = image_processor(images[0], return_tensors="ms") - input_processor = processor(images[0], return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify pixel_values - self.assertTrue( - np.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2) - ) - - # verify input_ids - # this was obtained with Tesseract 4.1.1 - # fmt: off - expected_decoding = "11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. 
Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # noqa: E231 - # fmt: on - decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # batched - input_image_processor = image_processor(images, return_tensors="ms") - input_processor = processor(images, padding=True, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify pixel_values - self.assertTrue( - np.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2) - ) - - # verify input_ids - # this was obtained with Tesseract 4.1.1 - # fmt: off - expected_decoding = "7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? 
aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223" # noqa: E231 - # fmt: on - decoding = processor.decode(input_processor.input_ids[1].tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - @slow - def test_processor_case_2(self): - # case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False - - image_processor = LayoutLMv3ImageProcessor(apply_ocr=False) - tokenizers = self.get_tokenizers - images = self.get_images - - for tokenizer in tokenizers: - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - - # not batched - words = ["hello", "world"] - boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] - input_processor = processor(images[0], words, boxes=boxes, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = list(input_processor.keys()) - for key in expected_keys: - self.assertIn(key, actual_keys) - - # verify input_ids - expected_decoding = "hello world" - decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # batched - words = [["hello", "world"], ["my", "name", "is", "niels"]] - boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] - input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - expected_decoding = "hello world" - decoding = processor.decode(input_processor.input_ids[0].tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # verify bbox - expected_bbox = [ - [3, 2, 5, 1], - [6, 7, 4, 2], - [3, 9, 2, 4], - [1, 1, 2, 3], - [1, 1, 2, 3], - [1, 1, 2, 3], - [1000, 1000, 1000, 1000], - ] - self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) - - @slow - def test_processor_case_3(self): - # case 3: token classification (training), apply_ocr=False - - image_processor = LayoutLMv3ImageProcessor(apply_ocr=False) - tokenizers = self.get_tokenizers - images = self.get_images - - for tokenizer in tokenizers: - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - - # not batched - words = ["weirdly", "world"] - boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] - word_labels = [1, 2] - input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "labels", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - expected_decoding = "weirdly world" - decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # verify labels - expected_labels = [1, -100, 2, -100] - self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels) - - # batched - words = [["hello", "world"], ["my", "name", "is", "niels"]] - boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] - word_labels = [[1, 2], [6, 3, 10, 2]] - input_processor = processor( - images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="ms" - ) - - # verify keys - expected_keys = ["attention_mask", 
"bbox", "input_ids", "labels", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - expected_decoding = "my name is niels" - decoding = processor.decode(input_processor.input_ids[1].tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # verify bbox - expected_bbox = [ - [3, 2, 5, 1], - [6, 7, 4, 2], - [3, 9, 2, 4], - [1, 1, 2, 3], - [1, 1, 2, 3], - [1, 1, 2, 3], - [1000, 1000, 1000, 1000], - ] - self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) - - # verify labels - expected_labels = [6, 3, 10, 2, -100, -100, -100] - self.assertListEqual(input_processor.labels[1].tolist(), expected_labels) - - @slow - def test_processor_case_4(self): - # case 4: visual question answering (inference), apply_ocr=True - - image_processor = LayoutLMv3ImageProcessor() - tokenizers = self.get_tokenizers - images = self.get_images - - for tokenizer in tokenizers: - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - - # not batched - question = "What's his name?" - input_processor = processor(images[0], question, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - # this was obtained with Tesseract 4.1.1 - # fmt: off - expected_decoding = "What's his name? 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. 
Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer" # noqa: E231 - # fmt: on - decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # batched - questions = ["How old is he?", "what's the time"] - input_processor = processor( - images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="ms" - ) - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - # this was obtained with Tesseract 4.1.1 - expected_decoding = "what's the time 7 ITC Limited REPORT AND ACCOUNTS 2013 I" - decoding = processor.decode(input_processor.input_ids[1].tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # verify bbox - # fmt: off - expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [1000, 1000, 1000, 1000]] # noqa: E231 - # fmt: on - self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox) - - @slow - def test_processor_case_5(self): - # case 5: visual question answering (inference), apply_ocr=False - - image_processor = LayoutLMv3ImageProcessor(apply_ocr=False) - tokenizers = self.get_tokenizers - images = self.get_images - - for tokenizer in tokenizers: - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - - # not batched - question = "What's his name?" - words = ["hello", "world"] - boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] - input_processor = processor(images[0], question, words, boxes, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - expected_decoding = "What's his name? hello world" - decoding = processor.decode(input_processor.input_ids.squeeze().tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # batched - questions = ["How old is he?", "what's the time"] - words = [["hello", "world"], ["my", "name", "is", "niels"]] - boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] - input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="ms") - - # verify keys - expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] - actual_keys = sorted(input_processor.keys()) - self.assertListEqual(actual_keys, expected_keys) - - # verify input_ids - expected_decoding = "How old is he? 
hello world" - decoding = processor.decode(input_processor.input_ids[0].tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - expected_decoding = "what's the time my name is niels" - decoding = processor.decode(input_processor.input_ids[1].tolist()) - self.assertSequenceEqual(decoding, expected_decoding) - - # verify bbox - expected_bbox = [[3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]] - self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox) diff --git a/tests/transformers/models/udop/test_tokenization_udop.py b/tests/transformers/models/udop/test_tokenization_udop.py deleted file mode 100644 index e57146aef..000000000 --- a/tests/transformers/models/udop/test_tokenization_udop.py +++ /dev/null @@ -1,1956 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import shutil -import tempfile -import unittest -from typing import List -from tokenizers import AddedToken -from mindnlp.transformers import ( - SpecialTokensMixin, - UdopTokenizer, - UdopTokenizerFast, - logging, -) -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import ( - get_tests_dir, - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) - -from ...test_tokenization_common import ( - SMALL_TRAINING_CORPUS, - TokenizerTesterMixin, - filter_non_english, - merge_model_tokenizer_mappings, -) - - -logger = logging.get_logger(__name__) -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - - -@require_sentencepiece -@require_tokenizers -class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "microsoft/udop-large" - tokenizer_class = UdopTokenizer - rust_tokenizer_class = UdopTokenizerFast - test_rust_tokenizer = True - from_pretrained_filter = filter_non_english - test_seq2seq = False - test_sentencepiece = True - - def get_words_and_boxes(self): - words = ["a", "weirdly", "test", "hello"] - boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129], [961, 885, 992, 912]] - - return words, boxes - - def get_words_and_boxes_batch(self): - words = [["a", "weirdly", "test"], ["hello", "my", "name", "is", "bob"]] - boxes = [ - [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]], - [[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]], - ] - - return words, boxes - - def get_question_words_and_boxes(self): - question = "what's his name?" 
- words = ["a", "weirdly", "test"] - boxes = [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]] - - return question, words, boxes - - def get_question_words_and_boxes_batch(self): - questions = ["what's his name?", "how is he called?"] - words = [["a", "weirdly", "test"], ["what", "a", "laif", "gastn"]] - boxes = [ - [[423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129]], - [[256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69]], - ] - - return questions, words, boxes - - def setUp(self): - super().setUp() - - # We have a SentencePiece fixture for testing - tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) - - def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00e9d,running" - output_text = "unwanted, running" - return input_text, output_text - - # override test in `test_tokenization_common.py` because of the required input format of the `__call__`` method of - # this tokenizer - def test_save_sentencepiece_tokenizer(self) -> None: - if not self.test_sentencepiece or not self.test_slow_tokenizer: - self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False") - # We want to verify that we will be able to save the tokenizer even if the original files that were used to - # build the tokenizer have been deleted in the meantime. - words, boxes = self.get_words_and_boxes() - - tokenizer_slow_1 = self.get_tokenizer() - encoding_tokenizer_slow_1 = tokenizer_slow_1( - words, - boxes=boxes, - ) - - tmpdirname_1 = tempfile.mkdtemp() - tmpdirname_2 = tempfile.mkdtemp() - - tokenizer_slow_1.save_pretrained(tmpdirname_1) - tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1) - encoding_tokenizer_slow_2 = tokenizer_slow_2( - words, - boxes=boxes, - ) - - shutil.rmtree(tmpdirname_1) - tokenizer_slow_2.save_pretrained(tmpdirname_2) - - tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2) - encoding_tokenizer_slow_3 = tokenizer_slow_3( - words, - boxes=boxes, - ) - shutil.rmtree(tmpdirname_2) - - self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2) - self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3) - - @slow - def test_sequence_builders(self): - tokenizer = self.tokenizer_class.from_pretrained("microsoft/udop-large") - - question, words, boxes = self.get_question_words_and_boxes() - - text = tokenizer.encode_boxes( - question.split(), - boxes=[tokenizer.pad_token_box for _ in range(len(question.split()))], - add_special_tokens=False, - ) - text_2 = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - assert encoded_pair == text + [1] + text_2 + [1] - - def test_add_special_tokens(self): - tokenizers: List[UdopTokenizer] = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" - special_token_box = [1000, 1000, 1000, 1000] - - tokenizer.add_special_tokens({"cls_token": special_token}) - encoded_special_token = tokenizer.encode_boxes( - [special_token], boxes=[special_token_box], add_special_tokens=False - ) - self.assertEqual(len(encoded_special_token), 1) - - decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True) - self.assertTrue(special_token not in decoded) - - def test_add_tokens_tokenizer(self): - tokenizers: List[UdopTokenizer] = 
self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - - # We usually have added tokens from the start in tests because our vocab fixtures are - # smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaa", "bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - words = "aaaaa bbbbbb low cccccccccdddddddd l".split() - boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] - - tokens = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - words = ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l".split() - boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] - - tokens = tokenizer.encode_boxes( - words, - boxes=boxes, - add_special_tokens=False, - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.pad_token_id) - - @require_tokenizers - def test_encode_decode_with_spaces(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - - new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)] - tokenizer.add_tokens(new_toks) - input = "[ABC][DEF][ABC][DEF]" - if self.space_between_special_tokens: - output = "[ABC] [DEF] [ABC] [DEF]" - else: - output = input - encoded = tokenizer.encode_boxes(input.split(), boxes=boxes, add_special_tokens=False) - decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) - self.assertIn(decoded, [output, output.lower()]) - - def test_encode_plus_with_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - padding_size = 10 - padding_idx = tokenizer.pad_token_id - - encoded_sequence = tokenizer.encode_plus_boxes(words, boxes=boxes, return_special_tokens_mask=True) - input_ids = encoded_sequence["input_ids"] - special_tokens_mask = 
encoded_sequence["special_tokens_mask"] - sequence_length = len(input_ids) - - # Test 'longest' and 'no_padding' don't do anything - tokenizer.padding_side = "right" - - not_padded_sequence = tokenizer.encode_plus_boxes( - words, - boxes=boxes, - padding=False, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - self.assertTrue(sequence_length == not_padded_sequence_length) - self.assertTrue(input_ids == not_padded_input_ids) - self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) - - not_padded_sequence = tokenizer.encode_plus_boxes( - words, - boxes=boxes, - padding=False, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - self.assertTrue(sequence_length == not_padded_sequence_length) - self.assertTrue(input_ids == not_padded_input_ids) - self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) - - # Test right padding - tokenizer.padding_side = "right" - - right_padded_sequence = tokenizer.encode_plus_boxes( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - right_padded_input_ids = right_padded_sequence["input_ids"] - - right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] - right_padded_sequence_length = len(right_padded_input_ids) - - self.assertTrue(sequence_length + padding_size == right_padded_sequence_length) - self.assertTrue(input_ids + [padding_idx] * padding_size == right_padded_input_ids) - self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) - - # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus_boxes( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - left_padded_input_ids = left_padded_sequence["input_ids"] - left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] - left_padded_sequence_length = len(left_padded_input_ids) - - self.assertTrue(sequence_length + padding_size == left_padded_sequence_length) - self.assertTrue([padding_idx] * padding_size + input_ids == left_padded_input_ids) - self.assertTrue([1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask) - - if "token_type_ids" in tokenizer.model_input_names: - token_type_ids = encoded_sequence["token_type_ids"] - left_padded_token_type_ids = left_padded_sequence["token_type_ids"] - right_padded_token_type_ids = right_padded_sequence["token_type_ids"] - - assert token_type_ids + [0] * padding_size == right_padded_token_type_ids - assert [0] * padding_size + token_type_ids == left_padded_token_type_ids - - if "attention_mask" in tokenizer.model_input_names: - attention_mask = encoded_sequence["attention_mask"] - right_padded_attention_mask = right_padded_sequence["attention_mask"] - left_padded_attention_mask = left_padded_sequence["attention_mask"] - - self.assertTrue(attention_mask + [0] * padding_size == right_padded_attention_mask) - self.assertTrue([0] * padding_size + attention_mask == left_padded_attention_mask) - - def test_internal_consistency(self): - tokenizers = self.get_tokenizers() - for 
tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - - tokens = [] - for word in words: - tokens.extend(tokenizer.tokenize(word)) - ids = tokenizer.convert_tokens_to_ids(tokens) - ids_2 = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - self.assertListEqual(ids, ids_2) - - tokens_2 = tokenizer.convert_ids_to_tokens(ids) - self.assertNotEqual(len(tokens_2), 0) - text_2 = tokenizer.decode(ids) - self.assertIsInstance(text_2, str) - - output_text = "a weirdly test hello" - self.assertEqual(text_2, output_text) - - def test_mask_output(self): - tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - - if ( - tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" - and "token_type_ids" in tokenizer.model_input_names - ): - information = tokenizer.encode_plus_boxes(words, boxes=boxes, add_special_tokens=True) - sequences, mask = information["input_ids"], information["token_type_ids"] - self.assertEqual(len(sequences), len(mask)) - - def test_number_of_added_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence - words, boxes = self.get_words_and_boxes() - - sequences = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - attached_sequences = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=True) - - # Method is implemented (e.g. not GPT-2) - if len(attached_sequences) != 2: - self.assertEqual( - tokenizer.num_special_tokens_to_add(pair=False), len(attached_sequences) - len(sequences) - ) - - # test 2: two sequences - question, words, boxes = self.get_question_words_and_boxes() - - sequences = tokenizer.encode_boxes(question, words, boxes=boxes, add_special_tokens=False) - attached_sequences = tokenizer.encode_boxes(question, words, boxes=boxes, add_special_tokens=True) - - # Method is implemented (e.g. 
not GPT-2) - if len(attached_sequences) != 2: - self.assertEqual( - tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) - ) - - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode_boxes( - words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode_boxes(words, boxes=boxes, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - - def test_padding(self, max_length=50): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) - pad_token_id = tokenizer_p.pad_token_id - - # Encode - Simple input - words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length") - input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length") - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.encode_boxes(words, boxes=boxes, padding="longest") - input_p = tokenizer_p.encode_boxes(words, boxes=boxes, padding=True) - self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) - - # Encode - Pair input - question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, 
pad_token_id) - input_r = tokenizer_r.encode_boxes( - question, words, boxes=boxes, max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_boxes( - question, words, boxes=boxes, max_length=max_length, padding="max_length" - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.encode_boxes(question, words, boxes=boxes, padding=True) - input_p = tokenizer_p.encode_boxes(question, words, boxes=boxes, padding="longest") - self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) - - # Encode_plus - Simple input - words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_plus_boxes( - words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus_boxes( - words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus_boxes( - words, boxes=boxes, max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_plus_boxes( - words, boxes=boxes, max_length=max_length, padding="max_length" - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - input_r = tokenizer_r.encode_plus_boxes(words, boxes=boxes, padding="longest") - input_p = tokenizer_p.encode_plus_boxes(words, boxes=boxes, padding=True) - self.assert_padded_input_match( - input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id - ) - - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - # Encode_plus - Pair input - question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_plus_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus_boxes( - question, words, boxes=boxes, max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_plus_boxes( - question, words, boxes=boxes, max_length=max_length, padding="max_length" - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus_boxes(question, words, boxes=boxes, padding="longest") - input_p = tokenizer_p.encode_plus_boxes(question, words, boxes=boxes, padding=True) - self.assert_padded_input_match( - input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id - ) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - # Batch_encode_plus - Simple input - words, boxes = self.get_words_and_boxes_batch() - - input_r = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - 
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - padding="max_length", - ) - input_p = tokenizer_p.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - padding="max_length", - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - padding="longest", - ) - input_p = tokenizer_p.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - padding=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - input_r = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes, padding="longest") - input_p = tokenizer_p.batch_encode_plus_boxes(words, boxes=boxes, padding=True) - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - # Batch_encode_plus - Pair input - questions, words, boxes = self.get_question_words_and_boxes_batch() - - input_r = tokenizer_r.batch_encode_plus_boxes( - list(zip(questions, words)), - is_pair=True, - boxes=boxes, - max_length=max_length, - truncation=True, - padding="max_length", - ) - input_p = tokenizer_p.batch_encode_plus_boxes( - list(zip(questions, words)), - is_pair=True, - boxes=boxes, - max_length=max_length, - truncation=True, - padding="max_length", - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.batch_encode_plus_boxes( - list(zip(questions, words)), - is_pair=True, - boxes=boxes, - padding=True, - ) - input_p = tokenizer_p.batch_encode_plus_boxes( - list(zip(questions, words)), - is_pair=True, - boxes=boxes, - padding="longest", - ) - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - # Using pad on single examples after tokenization - words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_plus_boxes(words, boxes=boxes) - input_r = tokenizer_r.pad(input_r) - - input_p = tokenizer_r.encode_plus_boxes(words, boxes=boxes) - input_p = tokenizer_r.pad(input_p) - - self.assert_padded_input_match( - input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id - ) - - # Using pad on single examples after tokenization - input_r = tokenizer_r.encode_plus_boxes(words, boxes=boxes) - input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") - - input_p = tokenizer_r.encode_plus_boxes(words, boxes=boxes) - input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") - - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - - # Using pad after tokenization - words, boxes = self.get_words_and_boxes_batch() - input_r = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - ) - input_r = tokenizer_r.pad(input_r) - - input_p = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - ) - input_p = tokenizer_r.pad(input_p) - - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - # Using pad after tokenization - words, boxes = self.get_words_and_boxes_batch() - input_r = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - ) - input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") - - input_p = tokenizer_r.batch_encode_plus_boxes( - 
words, - boxes=boxes, - ) - input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") - - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - def test_padding_warning_message_fast_tokenizer(self): - if not self.test_rust_tokenizer: - self.skipTest(reason="test_rust_tokenizer is set to False") - - words, boxes = self.get_words_and_boxes_batch() - - tokenizer_fast = self.get_rust_tokenizer() - - encoding_fast = tokenizer_fast( - words, - boxes=boxes, - ) - with self.assertLogs("mindnlp.transformers", level="WARNING") as cm: - tokenizer_fast.pad(encoding_fast) - self.assertEqual(len(cm.records), 1) - self.assertIn( - "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to" - " encode the text followed by a call to the `pad` method to get a padded encoding.", - cm.records[0].message, - ) - - if not self.test_slow_tokenizer: - self.skipTest(reason="test_slow_tokenizer is set to False") - - tokenizer_slow = self.get_tokenizer() - - encoding_slow = tokenizer_slow( - words, - boxes=boxes, - ) - - with self.assertLogs(level="WARNING") as cm: - # We want to assert there are no warnings, but the 'assertLogs' method does not support that. - # Therefore, we are adding a dummy warning, and then we will assert it is the only warning. - logger.warning("Dummy warning") - tokenizer_slow.pad(encoding_slow) - self.assertEqual(len(cm.records), 1) - self.assertIn( - "Dummy warning", - cm.records[0].message, - ) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Test not batched - words, boxes = self.get_words_and_boxes() - encoded_sequences_1 = tokenizer.encode_plus_boxes(words, boxes=boxes) - encoded_sequences_2 = tokenizer(words, boxes=boxes) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test not batched pairs - question, words, boxes = self.get_question_words_and_boxes() - encoded_sequences_1 = tokenizer.encode_plus_boxes(words, boxes=boxes) - encoded_sequences_2 = tokenizer(words, boxes=boxes) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test batched - words, boxes = self.get_words_and_boxes_batch() - encoded_sequences_1 = tokenizer.batch_encode_plus_boxes(words, is_pair=False, boxes=boxes) - encoded_sequences_2 = tokenizer(words, boxes=boxes) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - def test_batch_encode_plus_batch_sequence_length(self): - # Tests that all encoded values have the correct size - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes_batch() - - encoded_sequences = [ - tokenizer.encode_plus_boxes(words_example, boxes=boxes_example) - for words_example, boxes_example in zip(words, boxes) - ] - encoded_sequences_batch = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, padding=False - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - maximum_length = len( - max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len) - ) - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - encoded_sequences_padded = [ - 
tokenizer.encode_plus_boxes( - words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length" - ) - for words_example, boxes_example in zip(words, boxes) - ] - - encoded_sequences_batch_padded = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, padding=True - ) - self.assertListEqual( - encoded_sequences_padded, - self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded), - ) - - # check 'longest' is unsensitive to a max length - encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, padding=True - ) - encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding="longest" - ) - for key in encoded_sequences_batch_padded_1.keys(): - self.assertListEqual( - encoded_sequences_batch_padded_1[key], - encoded_sequences_batch_padded_2[key], - ) - - # check 'no_padding' is unsensitive to a max length - encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, padding=False - ) - encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, max_length=maximum_length + 10, padding=False - ) - for key in encoded_sequences_batch_padded_1.keys(): - self.assertListEqual( - encoded_sequences_batch_padded_1[key], - encoded_sequences_batch_padded_2[key], - ) - - @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") - def test_batch_encode_plus_overflowing_tokens(self): - pass - - def test_batch_encode_plus_padding(self): - # Test that padded sequences are equivalent between batch_encode_plus and encode_plus - - # Right padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes_batch() - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - encoded_sequences = [ - tokenizer.encode_plus_boxes( - words_example, boxes=boxes_example, max_length=max_length, padding="max_length" - ) - for words_example, boxes_example in zip(words, boxes) - ] - encoded_sequences_batch = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - # Left padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokenizer.padding_side = "left" - words, boxes = self.get_words_and_boxes_batch() - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - encoded_sequences = [ - tokenizer.encode_plus_boxes( - words_example, boxes=boxes_example, max_length=max_length, padding="max_length" - ) - for words_example, boxes_example in zip(words, boxes) - ] - encoded_sequences_batch = tokenizer.batch_encode_plus_boxes( - words, is_pair=False, boxes=boxes, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - def test_padding_to_multiple_of(self): - tokenizers = self.get_tokenizers() - 
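The `test_padding_to_multiple_of` body that follows checks three things: with `padding=True` every returned list is padded up to the next multiple of 8, `pad_to_multiple_of=8` on its own (without padding enabled) leaves lengths unchanged, and combining truncation with a `max_length` that is not itself a multiple of 8 raises a `ValueError`. A plain-Python sketch of the rounding rule those assertions rely on (not the tokenizer's internal code):

def pad_to_multiple(ids, pad_token_id, multiple=8):
    """Pad `ids` with `pad_token_id` up to the next multiple of `multiple`."""
    remainder = len(ids) % multiple
    if remainder:
        ids = ids + [pad_token_id] * (multiple - remainder)
    return ids

padded = pad_to_multiple([5, 17, 42], pad_token_id=0)
assert len(padded) % 8 == 0
assert padded == [5, 17, 42, 0, 0, 0, 0, 0]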
for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.pad_token is None: - self.skipTest(reason="No padding token.") - else: - words, boxes = self.get_words_and_boxes() - - normal_tokens = tokenizer(words, boxes=boxes, padding=True, pad_to_multiple_of=8) - - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - normal_tokens = tokenizer(words, boxes=boxes, pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - # Should also work with truncation - normal_tokens = tokenizer(words, boxes=boxes, padding=True, truncation=True, pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - # truncation to something which is not a multiple of pad_to_multiple_of raises an error - self.assertRaises( - ValueError, - tokenizer.__call__, - words, - boxes=boxes, - padding=True, - truncation=True, - max_length=12, - pad_to_multiple_of=8, - ) - - def test_tokenizer_slow_store_full_signature(self): - signature = inspect.signature(self.tokenizer_class.__init__) - tokenizer = self.get_tokenizer() - - for parameter_name, parameter in signature.parameters.items(): - if parameter.default != inspect.Parameter.empty: - self.assertIn(parameter_name, tokenizer.init_kwargs) - - def test_build_inputs_with_special_tokens(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - self.skipTest(reason="test_slow_tokenizer is set to False") - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Input tokens id - words, boxes = self.get_words_and_boxes() - input_simple = tokenizer_p.encode_boxes(words, boxes=boxes, add_special_tokens=False) - input_pair = tokenizer_p.encode_boxes(words, boxes=boxes, add_special_tokens=False) - - # Generate output - output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) - output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) - self.assertEqual(output_p, output_r) - - # Generate pair output - output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) - output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) - self.assertEqual(output_p, output_r) - - def test_special_tokens_mask_input_pairs(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus_boxes( - words, - boxes=boxes, - add_special_tokens=True, - return_special_tokens_mask=True, - # add_prefix_space=False, - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - 
filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_special_tokens_mask(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - # Testing single inputs - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus_boxes( - words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_save_and_load_tokenizer(self): - # safety check on max_len default value so we are sure the test works - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertNotEqual(tokenizer.model_max_length, 42) - - # Now let's start the test - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - words, boxes = self.get_words_and_boxes() - tmpdirname = tempfile.mkdtemp() - - before_tokens = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - after_vocab = after_tokenizer.get_vocab() - self.assertListEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - - shutil.rmtree(tmpdirname) - - @unittest.skip(reason="Not implemented") - def test_right_and_left_truncation(self): - pass - - def test_right_and_left_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode_boxes( - words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "left" - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode_boxes( - words, 
boxes=boxes, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert [padding_idx] * padding_size + encoded_sequence == padded_sequence - - # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode_boxes(words, boxes=boxes, padding=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode_boxes(words, boxes=boxes, padding="longest") - padded_sequence_left_length = len(padded_sequence_left) - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode_boxes(words, boxes=boxes) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode_boxes(words, boxes=boxes, padding=False) - padded_sequence_left_length = len(padded_sequence_left) - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left - - def test_token_type_ids(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence - words, boxes = self.get_words_and_boxes() - - output = tokenizer(words, boxes=boxes, return_token_type_ids=True) - - # Assert that the token type IDs have the same length as the input IDs - self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) - - # Assert that the token type IDs have the same length as the attention mask - self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) - - self.assertIn(0, output["token_type_ids"]) - self.assertNotIn(1, output["token_type_ids"]) - - # test 2: two sequences (question + words) - question, words, boxes = self.get_question_words_and_boxes() - - output = tokenizer(question, words, boxes, return_token_type_ids=True) - - # Assert that the token type IDs have the same length as the input IDs - self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"])) - - # Assert that the token type IDs have the same length as the attention mask - self.assertEqual(len(output["token_type_ids"]), len(output["attention_mask"])) - - self.assertIn(0, output["token_type_ids"]) - self.assertNotIn(1, output["token_type_ids"]) - - def test_offsets_mapping(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - text = ["a", "wonderful", "test"] - boxes = [[1, 8, 12, 20] for _ in range(len(text))] - - # No pair - tokens_with_offsets = tokenizer_r.encode_plus_boxes( - text, - boxes=boxes, - return_special_tokens_mask=True, - return_offsets_mapping=True, - add_special_tokens=True, - ) - added_tokens = tokenizer_r.num_special_tokens_to_add(False) - offsets = tokens_with_offsets["offset_mapping"] - 
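The offset-mapping and special-tokens-mask assertions that follow reduce to index alignment: `offset_mapping` and `special_tokens_mask` run parallel to `input_ids`, so the mask both counts the added special tokens and lets the plain ids be recovered. A self-contained illustration with made-up values (not a real UDOP encoding):

# Made-up, index-aligned fields of a single encoding.
input_ids = [101, 7, 42, 9, 102]
offset_mapping = [(0, 0), (0, 1), (2, 11), (12, 16), (0, 0)]
special_tokens_mask = [1, 0, 0, 0, 1]

assert len(offset_mapping) == len(input_ids)   # one offset pair per token
assert sum(special_tokens_mask) == 2           # exactly the added special tokens
ids_without_special = [i for i, m in zip(input_ids, special_tokens_mask) if not m]
assert ids_without_special == [7, 42, 9]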
- # Assert there is the same number of tokens and offsets - self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) - - # Assert there is online added_tokens special_tokens - self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - - # Pairs - text = "what's his name" - pair = ["a", "wonderful", "test"] - boxes = [[1, 8, 12, 20] for _ in range(len(pair))] - tokens_with_offsets = tokenizer_r.encode_plus_boxes( - text, - pair, - boxes=boxes, - return_special_tokens_mask=True, - return_offsets_mapping=True, - add_special_tokens=True, - ) - added_tokens = tokenizer_r.num_special_tokens_to_add(True) - offsets = tokens_with_offsets["offset_mapping"] - - # Assert there is the same number of tokens and offsets - self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) - - # Assert there is online added_tokens special_tokens - self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - - @unittest.skip(reason="Chat template tests don't play well with table/layout models.") - def test_chat_template(self): - pass - - @unittest.skip(reason="Chat template tests don't play well with table/layout models.") - def test_chat_template_batched(self): - pass - - @require_mindspore - @slow - def test_torch_encode_plus_sent_to_model(self): - import mindspore - - from mindnlp.transformers import MODEL_MAPPING, TOKENIZER_MAPPING - - MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) - - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - self.skipTest(f"{tokenizer.__class__} not in MODEL_TOKENIZER_MAPPING") - - config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] - config = config_class() - - if config.is_encoder_decoder or config.pad_token_id is None: - self.skipTest(reason="Model is an encoder-decoder or has no padding token set.") - - model = model_class(config) - - # Make sure the model contains at least the full vocabulary size in its embedding matrix - is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") - assert ( - (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) - if is_using_common_embeddings - else True - ) - - # Build sequence - words, boxes = self.get_words_and_boxes() - encoded_sequence = tokenizer.encode_plus_boxes(words, boxes=boxes, return_tensors="ms") - batch_encoded_sequence = tokenizer.batch_encode_plus_boxes( - [words, words], [boxes, boxes], return_tensors="ms" - ) - # This should not fail - model(**encoded_sequence) - model(**batch_encoded_sequence) - - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - self.skipTest(reason="test_rust_tokenizer is set to False") - - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - self.skipTest(reason="test_slow_tokenizer is set to False") - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - words, boxes = self.get_words_and_boxes() - - ids = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - rust_ids = rust_tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - ids = tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=True) - rust_ids = rust_tokenizer.encode_boxes(words, boxes=boxes, add_special_tokens=True) 
- self.assertListEqual(ids, rust_ids) - - def test_tokenization_python_rust_equals(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - self.skipTest(reason="test_slow_tokenizer is set to False") - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - words, boxes = self.get_words_and_boxes() - - # Ensure basic input match - input_p = tokenizer_p.encode_plus_boxes(words, boxes=boxes) - input_r = tokenizer_r.encode_plus_boxes(words, boxes=boxes) - - for key in filter( - lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() - ): - self.assertSequenceEqual(input_p[key], input_r[key]) - - input_pairs_p = tokenizer_p.encode_plus_boxes(words, boxes=boxes) - input_pairs_r = tokenizer_r.encode_plus_boxes(words, boxes=boxes) - - for key in filter( - lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() - ): - self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) - - words = ["hello" for _ in range(1000)] - boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)] - - # Ensure truncation match - input_p = tokenizer_p.encode_plus_boxes(words, boxes=boxes, max_length=512, truncation=True) - input_r = tokenizer_r.encode_plus_boxes(words, boxes=boxes, max_length=512, truncation=True) - - for key in filter( - lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() - ): - self.assertSequenceEqual(input_p[key], input_r[key]) - - # Ensure truncation with stride match - input_p = tokenizer_p.encode_plus_boxes( - words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True - ) - input_r = tokenizer_r.encode_plus_boxes( - words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True - ) - - for key in filter( - lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys() - ): - self.assertSequenceEqual(input_p[key], input_r[key][0]) - - def test_embeded_special_tokens(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - self.skipTest(reason="test_slow_tokenizer is set to False") - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - words, boxes = self.get_words_and_boxes() - tokens_r = tokenizer_r.encode_plus_boxes( - words, - boxes=boxes, - add_special_tokens=True, - ) - tokens_p = tokenizer_p.encode_plus_boxes( - words, - boxes=boxes, - add_special_tokens=True, - ) - - for key in tokens_p.keys(): - self.assertEqual(tokens_r[key], tokens_p[key]) - - if "token_type_ids" in tokens_r: - self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) - - tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) - tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - self.assertSequenceEqual(tokens_r, tokens_p) - - def test_compare_add_special_tokens(self): - for tokenizer, pretrained_name, 
kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) - - words, boxes = self.get_words_and_boxes() - # tokenize() - no_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=False) - with_special_tokens = tokenizer_r.tokenize(" ".join(words), add_special_tokens=True) - self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) - - # encode() - no_special_tokens = tokenizer_r.encode_boxes(words, boxes=boxes, add_special_tokens=False) - with_special_tokens = tokenizer_r.encode_boxes(words, boxes=boxes, add_special_tokens=True) - self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) - - # encode_plus() - no_special_tokens = tokenizer_r.encode_plus_boxes(words, boxes=boxes, add_special_tokens=False) - with_special_tokens = tokenizer_r.encode_plus_boxes(words, boxes=boxes, add_special_tokens=True) - for key in no_special_tokens.keys(): - self.assertEqual( - len(no_special_tokens[key]), - len(with_special_tokens[key]) - simple_num_special_tokens_to_add, - ) - - # # batch_encode_plus - words, boxes = self.get_words_and_boxes_batch() - - no_special_tokens = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes, add_special_tokens=False) - with_special_tokens = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes, add_special_tokens=True) - for key in no_special_tokens.keys(): - for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): - self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) - - @slow - def test_udop_truncation_integration_test(self): - words, boxes = self.get_words_and_boxes() - - tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large", model_max_length=512) - - for i in range(12, 512): - new_encoded_inputs = tokenizer.encode_boxes(words, boxes=boxes, max_length=i, truncation=True) - - # Ensure that the input IDs are less than the max length defined. 
- self.assertLessEqual(len(new_encoded_inputs), i) - - tokenizer.model_max_length = 20 - new_encoded_inputs = tokenizer.encode_boxes(words, boxes=boxes, truncation=True) - dropped_encoded_inputs = tokenizer.encode_boxes(words, boxes=boxes, truncation=True) - - # Ensure that the input IDs are still truncated when no max_length is specified - self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs) - self.assertLessEqual(len(new_encoded_inputs), 20) - - def test_batch_encode_plus_tensors(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes_batch() - - # A Tensor cannot be build by sequences which are not the same size - self.assertRaises( - ValueError, tokenizer.batch_encode_plus_boxes, words, boxes=boxes, return_tensors="ms" - ) - self.assertRaises( - ValueError, tokenizer.batch_encode_plus_boxes, words, boxes=boxes, return_tensors="ms" - ) - - if tokenizer.pad_token_id is None: - self.assertRaises( - ValueError, - tokenizer.batch_encode_plus_boxes, - words, - boxes=boxes, - padding=True, - return_tensors="ms", - ) - self.assertRaises( - ValueError, - tokenizer.batch_encode_plus_boxes, - words, - boxes=boxes, - padding="longest", - return_tensors="ms", - ) - else: - pytorch_tensor = tokenizer.batch_encode_plus_boxes( - words, boxes=boxes, padding=True, return_tensors="ms" - ) - tensorflow_tensor = tokenizer.batch_encode_plus_boxes( - words, boxes=boxes, padding="longest", return_tensors="ms" - ) - encoded_sequences = tokenizer.batch_encode_plus_boxes(words, boxes=boxes, padding=True) - - for key in encoded_sequences.keys(): - pytorch_value = pytorch_tensor[key].tolist() - tensorflow_value = tensorflow_tensor[key].numpy().tolist() - encoded_value = encoded_sequences[key] - - self.assertEqual(pytorch_value, tensorflow_value, encoded_value) - - def test_sequence_ids(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - if not tokenizer.is_fast: - continue - with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0 = "Test this method." 
- seq_1 = ["With", "these", "inputs."] - boxes = [[1000, 1000, 1000, 1000] for _ in range(len(seq_1))] - - # We want to have sequence 0 and sequence 1 are tagged - # respectively with 0 and 1 token_ids - # (regardless of whether the model use token type ids) - # We use this assumption in the QA pipeline among other place - output = tokenizer(seq_0.split(), boxes=boxes) - self.assertIn(0, output.sequence_ids()) - - output = tokenizer(seq_0, seq_1, boxes=boxes) - self.assertIn(0, output.sequence_ids()) - self.assertIn(1, output.sequence_ids()) - - if tokenizer.num_special_tokens_to_add(pair=True): - self.assertIn(None, output.sequence_ids()) - - def test_special_tokens_initialization(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs - ) - words = "Hey this is a token".split() - boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] - r_output = tokenizer_r.encode_boxes(words, boxes=boxes) - - special_token_id = tokenizer_r.encode_boxes( - [""], boxes=[1000, 1000, 1000, 1000], add_special_tokens=False - )[0] - - self.assertTrue(special_token_id in r_output) - - if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True - ) - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs - ) - - words = "Hey this is a token".split() - boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] - - p_output = tokenizer_p.encode_boxes(words, boxes=boxes) - cr_output = tokenizer_cr.encode_boxes(words, boxes=boxes) - - self.assertEqual(p_output, r_output) - self.assertEqual(cr_output, r_output) - self.assertTrue(special_token_id in p_output) - self.assertTrue(special_token_id in cr_output) - - def test_training_new_tokenizer(self): - # This feature only exists for fast tokenizers - if not self.test_rust_tokenizer: - self.skipTest(reason="test_rust_tokenizer is set to False") - - tokenizer = self.get_rust_tokenizer() - new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) - - # Test we can use the new tokenizer with something not seen during training - text = [["this", "is", "the"], ["how", "are", "you"]] - boxes = [[[1, 2, 3, 4], [5, 6, 7, 8], [1, 3, 4, 8]], [[5, 6, 7, 8], [4, 5, 6, 7], [3, 9, 2, 7]]] - inputs = new_tokenizer(text, boxes=boxes) - self.assertEqual(len(inputs["input_ids"]), 2) - decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) - expected_result = "this is the" - - if tokenizer.backend_tokenizer.normalizer is not None: - expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) - self.assertEqual(expected_result, decoded_input) - - # We check that the parameters of the tokenizer remained the same - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) - self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) - - # Check we have the correct max_length for both pair and non-pair inputs. 
- self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) - self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) - - # Assert the set of special tokens match as we didn't ask to change them - self.assertSequenceEqual( - tokenizer.all_special_tokens_extended, - new_tokenizer.all_special_tokens_extended, - ) - - self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) - - def test_training_new_tokenizer_with_special_tokens_change(self): - # This feature only exists for fast tokenizers - if not self.test_rust_tokenizer: - self.skipTest(reason="test_rust_tokenizer is set to False") - - tokenizer = self.get_rust_tokenizer() - # Test with a special tokens map - class_signature = inspect.signature(tokenizer.__class__) - if "cls_token" in class_signature.parameters: - new_tokenizer = tokenizer.train_new_from_iterator( - SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: ""} - ) - cls_id = new_tokenizer.get_vocab()[""] - self.assertEqual(new_tokenizer.cls_token, "") - self.assertEqual(new_tokenizer.cls_token_id, cls_id) - - # Create a new mapping from the special tokens defined in the original tokenizer - special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() - special_tokens_list.remove("additional_special_tokens") - special_tokens_map = {} - for token in special_tokens_list: - # Get the private one to avoid unnecessary warnings. - if getattr(tokenizer, f"_{token}") is not None: - special_token = getattr(tokenizer, token) - special_tokens_map[special_token] = f"{special_token}a" - - # Train new tokenizer - new_tokenizer = tokenizer.train_new_from_iterator( - SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map - ) - - # Check the changes - for token in special_tokens_list: - # Get the private one to avoid unnecessary warnings. - if getattr(tokenizer, f"_{token}") is None: - continue - special_token = getattr(tokenizer, token) - if special_token in special_tokens_map: - new_special_token = getattr(new_tokenizer, token) - self.assertEqual(special_tokens_map[special_token], new_special_token) - - new_id = new_tokenizer.get_vocab()[new_special_token] - self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) - - # Check if the AddedToken / string format has been kept - for special_token in tokenizer.all_special_tokens_extended: - if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: - # The special token must appear identically in the list of the new tokenizer. - self.assertTrue( - special_token in new_tokenizer.all_special_tokens_extended, - f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", - ) - elif isinstance(special_token, AddedToken): - # The special token must appear in the list of the new tokenizer as an object of type AddedToken with - # the same parameters as the old AddedToken except the content that the user has requested to change. 
- special_token_str = special_token.content - new_special_token_str = special_tokens_map[special_token_str] - - find = False - for candidate in new_tokenizer.all_special_tokens_extended: - if ( - isinstance(candidate, AddedToken) - and candidate.content == new_special_token_str - and candidate.lstrip == special_token.lstrip - and candidate.rstrip == special_token.rstrip - and candidate.normalized == special_token.normalized - and candidate.single_word == special_token.single_word - ): - find = True - break - self.assertTrue( - find, - f"'{new_special_token_str}' doesn't appear in the list " - f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as " - f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}", - ) - elif special_token not in special_tokens_map: - # The special token must appear identically in the list of the new tokenizer. - self.assertTrue( - special_token in new_tokenizer.all_special_tokens_extended, - f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", - ) - - else: - # The special token must appear in the list of the new tokenizer as an object of type string. - self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) - - # Test we can use the new tokenizer with something not seen during training - words = [["this", "is"], ["hello", "🤗"]] - boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[1, 2, 3, 4], [5, 6, 7, 8]]] - inputs = new_tokenizer(words, boxes=boxes) - self.assertEqual(len(inputs["input_ids"]), 2) - decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) - expected_result = "this is" - - if tokenizer.backend_tokenizer.normalizer is not None: - expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) - self.assertEqual(expected_result, decoded_input) - - def test_prepare_for_model(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - # only test prepare_for_model for the slow tokenizer - if tokenizer.__class__.__name__ == "UdopTokenizerFast": - continue - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - prepared_input_dict = tokenizer.prepare_for_model_boxes(words, boxes=boxes, add_special_tokens=True) - - input_dict = tokenizer.encode_plus_boxes(words, boxes=boxes, add_special_tokens=True) - - self.assertEqual(input_dict, prepared_input_dict) - - def test_padding_different_model_input_name(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - self.skipTest(reason="test_slow_tokenizer is set to False") - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) - pad_token_id = tokenizer_p.pad_token_id - - words, boxes = self.get_words_and_boxes_batch() - - input_r = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes) - input_p = tokenizer_r.batch_encode_plus_boxes(words, boxes=boxes) - - # rename encoded batch to "inputs" - input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]] - del input_r[tokenizer_r.model_input_names[0]] - - input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]] - 
del input_p[tokenizer_p.model_input_names[0]] - - # Renaming `input_ids` to `inputs` - tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:] - tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:] - - input_r = tokenizer_r.pad(input_r, padding="longest") - input_p = tokenizer_r.pad(input_p, padding="longest") - - max_length = len(input_p["inputs"][0]) - self.assert_batch_padded_input_match( - input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs" - ) - - def test_batch_encode_dynamic_overflowing(self): - """ - When calling batch_encode with multiple sequences, it can return a different number of - overflowing encodings for each sequence: - [ - Sequence 1: [Encoding 1, Encoding 2], - Sequence 2: [Encoding 1], - Sequence 3: [Encoding 1, Encoding 2, ... Encoding N] - ] - This needs to be padded so that it can be represented as a tensor - """ - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_mindspore_available(): - returned_tensor = "ms" - else: - returned_tensor = "jax" - - # Single example - words, boxes = self.get_words_and_boxes() - tokens = tokenizer.encode_plus_boxes( - words, - boxes=boxes, - max_length=6, - padding=True, - truncation=True, - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - if key != "bbox": - self.assertEqual(len(tokens[key].shape), 2) - else: - self.assertEqual(len(tokens[key].shape), 3) - - # Batch of examples - # For these 2 examples, 3 training examples will be created - words, boxes = self.get_words_and_boxes_batch() - tokens = tokenizer.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=6, - padding=True, - truncation="only_first", - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - if key != "bbox": - self.assertEqual(len(tokens[key].shape), 2) - self.assertEqual(tokens[key].shape[-1], 6) - else: - self.assertEqual(len(tokens[key].shape), 3) - self.assertEqual(tokens[key].shape[-1], 4) - - @unittest.skip(reason="TO DO: overwrite this very extensive test.") - def test_alignement_methods(self): - pass - - @unittest.skip(reason="UDOP tokenizer requires boxes besides sequences.") - def test_maximum_encoding_length_pair_input(self): - pass - - @unittest.skip(reason="UDOP tokenizer requires boxes besides sequences.") - def test_maximum_encoding_length_single_input(self): - pass - - @unittest.skip(reason="UDOP tokenizer requires boxes besides sequences.") - def test_pretokenized_inputs(self): - pass - - @unittest.skip(reason="UDOP tokenizer always expects pretokenized inputs.") - def test_compare_pretokenized_inputs(self): - pass - - @unittest.skip(reason="UDOP fast tokenizer does not support prepare_for_model") - def test_compare_prepare_for_model(self): - pass - - @slow - def test_only_label_first_subword(self): - words = ["hello", "niels"] - boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))] - word_labels = [0, 1] - - # test slow tokenizer - tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large") - encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [0, 1, -100, -100, -100]) - -
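The expected label lists above and below encode the subword rule for `word_labels`: with the default `only_label_first_subword=True` only the first piece of each word keeps its word label, and every other position, including special tokens, gets -100 (the index conventionally ignored by the loss); with `only_label_first_subword=False` the label is repeated on every piece of the word. A plain-Python sketch of that propagation (not the tokenizer's internal code):

def propagate_word_labels(word_ids, word_labels, only_label_first_subword=True):
    """`word_ids` holds the word index of each token, or None for special tokens."""
    labels, previous = [], None
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)  # special tokens are never labelled
        elif only_label_first_subword and word_id == previous:
            labels.append(-100)  # later pieces of the same word are masked
        else:
            labels.append(word_labels[word_id])
        previous = word_id
    return labels

# "hello" -> 1 piece, "niels" -> 3 pieces, plus one trailing special token:
word_ids = [0, 1, 1, 1, None]
assert propagate_word_labels(word_ids, [0, 1]) == [0, 1, -100, -100, -100]
assert propagate_word_labels(word_ids, [0, 1], only_label_first_subword=False) == [0, 1, 1, 1, -100]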
tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large", only_label_first_subword=False) - encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [0, 1, 1, 1, -100]) - - # test fast tokenizer - tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large") - encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [0, 1, -100, -100, -100]) - - tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large", only_label_first_subword=False) - encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels) - self.assertListEqual(encoding.labels, [0, 1, 1, 1, -100]) - - @slow - def test_udop_integration_test(self): - tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large") - tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large") - - # There are 3 cases: - # CASE 1: document image classification (training + inference), document image token classification (inference), - # in which case only words and normalized bounding boxes are provided to the tokenizer - # CASE 2: document image token classification (training), - # in which case one also provides word labels to the tokenizer - # CASE 3: document image visual question answering (inference), - # in which case one also provides a question to the tokenizer - - # We need to test all 3 cases both on batched and non-batched inputs. - - # CASE 1: not batched - words, boxes = self.get_words_and_boxes() - - # fmt: off - expected_results = {'input_ids': [3, 9, 10088, 120, 794, 21820, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'bbox': [[423, 237, 440, 251], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [961, 885, 992, 912], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231 - # fmt: on - - encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20) - encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) - self.assertDictEqual(dict(encoding_p), expected_results) - self.assertDictEqual(dict(encoding_r), expected_results) - - # CASE 1: batched - words, boxes = self.get_words_and_boxes_batch() - - # fmt: off - expected_results = {'input_ids': [[3, 9, 10088, 120, 794, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [21820, 82, 564, 19, 3, 17396, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'bbox': [[[423, 237, 440, 251], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E231 - # fmt: on - - encoding_p = tokenizer_p(words, boxes=boxes, 
padding="max_length", max_length=20) - encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20) - self.assertDictEqual(dict(encoding_p), expected_results) - self.assertDictEqual(dict(encoding_r), expected_results) - - # CASE 2: not batched - words, boxes = self.get_words_and_boxes() - word_labels = [1, 2, 3, 4] - - # fmt: off - expected_results = {'input_ids': [3, 9, 10088, 120, 794, 21820, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'bbox': [[423, 237, 440, 251], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [961, 885, 992, 912], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [1, -100, 2, -100, 3, 4, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231 - # fmt: on - - encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) - encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) - - for key in expected_results: - self.assertListEqual(encoding_p[key], encoding_r[key]) - - self.assertDictEqual(dict(encoding_p), expected_results) - self.assertDictEqual(dict(encoding_r), expected_results) - - # CASE 2: batched - words, boxes = self.get_words_and_boxes_batch() - word_labels = [[1, 2, 3], [2, 46, 17, 22, 3]] - - # fmt: off - expected_results = {'input_ids': [[3, 9, 10088, 120, 794, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [21820, 82, 564, 19, 3, 17396, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'bbox': [[[423, 237, 440, 251], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[1, -100, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [2, 46, 17, 22, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]} # noqa: E231 - # fmt: on - - encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) - encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20) - self.assertDictEqual(dict(encoding_p), expected_results) - self.assertDictEqual(dict(encoding_r), expected_results) - - # CASE 3: not batched - question, words, boxes = self.get_question_words_and_boxes() - - # fmt: off - expected_results = {'input_ids': [125, 31, 7, 112, 564, 58, 1, 3, 9, 10088, 120, 794, 1, 0, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [423, 237, 440, 251], 
[423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]} # noqa: E231 - # fmt: on - - encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20) - encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20) - self.assertDictEqual(dict(encoding_p), expected_results) - self.assertDictEqual(dict(encoding_r), expected_results) - - # CASE 3: batched - questions, words, boxes = self.get_question_words_and_boxes_batch() - - # fmt: off - expected_results = {'input_ids': [[125, 31, 7, 112, 564, 58, 1, 3, 9, 10088, 120, 794, 1, 0, 0, 0, 0, 0, 0, 0], [149, 19, 3, 88, 718, 58, 1, 125, 3, 9, 50, 99, 1807, 17, 29, 1, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [256, 38, 330, 58], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]} # noqa: E231 - # fmt: on - - encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20) - encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20) - self.assertDictEqual(dict(encoding_p), expected_results) - self.assertDictEqual(dict(encoding_r), expected_results) - - @unittest.skip(reason="Doesn't support another framework than PyTorch") - def test_np_encode_plus_sent_to_model(self): - pass - - @unittest.skip(reason="Doesn't use SentencePiece") - def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): - pass - - @unittest.skip(reason="Doesn't use SentencePiece") - def test_sentencepiece_tokenize_and_decode(self): - pass - - def test_text_target(self): - tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large") - tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large") - - text = "hello world" - expected_decoding = "hello world" - - # should raise an error if we don't provide it using the `text_target` argument - with self.assertRaises(ValueError): - tokenizer_p(text) - - encoding_p = tokenizer_p(text_target=text) - encoding_r = tokenizer_r(text_target=text) - - self.assertListEqual(encoding_p["input_ids"], [21820, 296, 1]) - self.assertListEqual(encoding_p["attention_mask"], [1, 1, 1]) - self.assertDictEqual(dict(encoding_p), dict(encoding_r)) - self.assertEqual(tokenizer_p.decode(encoding_p["input_ids"]), expected_decoding) - - def test_special_tokens(self): - tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large") - tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large") - - # encode - text = "paragraph. 
Hey" - encoding_p = tokenizer_p.encode(text) - encoding_r = tokenizer_r.encode(text) - - assert encoding_p == encoding_r == [8986, 32942, 3, 5, 9459, 1] - - # decode - # this is different between slow/fast tokenizer - # due to the former having `spaces_between_special_tokens=True` by default - ids = [0, 8986, 32942, 32966, 32554, 32551, 1] - - # test slow tokenizer - decoding = tokenizer_p.decode(ids, spaces_between_special_tokens=False) - - expected_decoding = "paragraph" - assert decoding == expected_decoding - - # test fast tokenizer - decoding = tokenizer_r.decode(ids) - - expected_decoding = " paragraph" - assert decoding == expected_decoding - - def test_split_special_tokens(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - special_token = "" - special_sentence = "Hey this is a token" - _, _, boxes = self.get_question_words_and_boxes() - - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_rust = self.rust_tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs - ) - tokenizer_py = self.tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs - ) - - special_token_id = tokenizer_py.convert_tokens_to_ids(special_token) - encoded_special_token_unsplit = tokenizer_py.encode( - special_token, add_special_tokens=False, split_special_tokens=False - ) - self.assertTrue(special_token_id in encoded_special_token_unsplit) - - encoded_special_token_split = tokenizer_py.encode(special_token, add_special_tokens=False) - self.assertTrue(special_token_id not in encoded_special_token_split) - - py_tokens_output = tokenizer_py.tokenize(special_sentence) - rust_tokens_output = tokenizer_rust.tokenize(special_sentence, split_special_tokens=True) - self.assertTrue(special_token not in py_tokens_output) - self.assertTrue(special_token not in rust_tokens_output) - - py_tokens_output_unsplit = tokenizer_py.tokenize(special_sentence, split_special_tokens=False) - rust_tokens_output_unsplit = tokenizer_rust.tokenize(special_sentence, split_special_tokens=False) - - self.assertTrue(special_token in py_tokens_output_unsplit) - self.assertTrue(special_token in rust_tokens_output_unsplit) - - tmpdirname = tempfile.mkdtemp() - tokenizer_py.save_pretrained(tmpdirname) - fast_from_saved = self.tokenizer_class.from_pretrained(tmpdirname) - - output_tokens_reloaded_split = fast_from_saved.tokenize(special_sentence) - self.assertTrue(special_token not in output_tokens_reloaded_split) - - output_tokens_reloaded_unsplit = fast_from_saved.tokenize(special_sentence, split_special_tokens=False) - self.assertTrue(special_token in output_tokens_reloaded_unsplit) diff --git a/tests/transformers/models/umt5/__init__.py b/tests/transformers/models/umt5/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/umt5/test_modeling_umt5.py b/tests/transformers/models/umt5/test_modeling_umt5.py deleted file mode 100644 index eb1f3b41e..000000000 --- a/tests/transformers/models/umt5/test_modeling_umt5.py +++ /dev/null @@ -1,626 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import tempfile -import unittest - -from mindnlp.transformers import UMT5Config -from mindnlp.transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) -from mindnlp.utils import is_mindspore_available - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - AutoTokenizer, - UMT5EncoderModel, - UMT5ForConditionalGeneration, - UMT5ForQuestionAnswering, - UMT5ForSequenceClassification, - UMT5ForTokenClassification, - UMT5Model, - ) - - -# Copied from test.models.t5.test_modeling_t5.T5ModelTester with T5->UMT5 -class UMT5ModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=7, - # For common tests - is_training=True, - use_attention_mask=True, - use_labels=False, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - eos_token_id=1, - pad_token_id=0, - decoder_start_token_id=0, - scope=None, - decoder_layers=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - self.decoder_seq_length = decoder_seq_length - # For common tests - self.seq_length = self.decoder_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.decoder_start_token_id = decoder_start_token_id - self.scope = None - self.decoder_layers = decoder_layers - - def get_large_model_config(self): - return UMT5Config.from_pretrained("google/umt5-base") - - def prepare_inputs_dict( - self, - config, - input_ids, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - ): - if attention_mask is None: - attention_mask = input_ids.ne(config.pad_token_id) - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.num_hidden_layers, config.num_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.num_decoder_layers, config.num_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = 
ops.ones( - config.num_decoder_layers, config.num_attention_heads - ) - return { - "input_ids": input_ids, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - - # we need to clamp the input ids here to avoid having pad token in between - # this is because for NllbMoe the position_ids are prepared such that - # all pad tokens have pos id = 2 and rest are between 2..seq_length - # and the seq_length here is seq_length - num_pad_tokens - # but when using past, there is no way of knowing if the past input ids had - # pad tokens in them, which results in incorrect seq_lenth and which in turn results in - # position_ids being off by num_pad_tokens in past input - input_ids = input_ids.clamp(self.pad_token_id + 2) - input_ids[:, -1] = self.eos_token_id # Eos Token - decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1) - - config = self.get_config() - config.encoder_attention_heads = config.num_attention_heads - input_dict = self.prepare_inputs_dict(config, input_ids, decoder_input_ids) - return config, input_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_pipeline_config(self): - return UMT5Config( - vocab_size=166, # t5 forces 100 extra tokens - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def get_config(self): - return UMT5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_decoder_layers=self.decoder_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - decoder_start_token_id=self.decoder_start_token_id, - ) - - def create_and_check_model( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = UMT5Model(config=config) - model.eval() - result = model( - input_ids=input_ids, - decoder_input_ids=decoder_input_ids, - attention_mask=attention_mask, - decoder_attention_mask=decoder_attention_mask, - ) - result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - decoder_output = result.last_hidden_state - decoder_past = result.past_key_values - encoder_output = result.encoder_last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - 
self.parent.assertEqual(decoder_output.shape, (self.batch_size, self.decoder_seq_length, self.hidden_size)) - # There should be `num_layers` key value embeddings stored in decoder_past - self.parent.assertEqual(len(decoder_past), config.num_layers) - # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple - self.parent.assertEqual(len(decoder_past[0]), 4) - - def create_and_check_decoder_model_past( - self, - config, - input_ids, - decoder_input_ids, - attention_mask, - decoder_attention_mask, - lm_labels, - ): - model = UMT5Model(config=config).get_decoder().eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_model_fp16_forward( - self, - config, - input_dict, - ): - model = UMT5Model(config=config).half().eval() - output = model(**input_dict)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_with_sequence_classification_head( - self, - config, - input_dict, - ): - labels = mindspore.tensor([1] * self.batch_size, dtype=mindspore.int64) - model = UMT5ForSequenceClassification(config=config).eval() - outputs = model(**input_dict, labels=labels) - # self.parent.assertEqual(len(outputs), 4) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, config.num_labels)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - -@require_mindspore -class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (UMT5Model, UMT5ForConditionalGeneration, UMT5ForSequenceClassification, UMT5ForQuestionAnswering) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (UMT5ForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": UMT5Model, - "question-answering": UMT5ForQuestionAnswering, - "summarization": UMT5ForConditionalGeneration, - "text-classification": UMT5ForSequenceClassification, - "text2text-generation": UMT5ForConditionalGeneration, - "translation": UMT5ForConditionalGeneration, - "zero-shot": UMT5ForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = False - test_pruning = False - test_missing_keys = True - # The small UMT5 model needs higher percentages for CPU/MP tests - model_split_percents = [0.5, 0.8, 0.9] - - def setUp(self): - self.model_tester = 
UMT5ModelTester(self) - - # `QAPipelineTests` is not working well with slow tokenizers (for some models) and we don't want to touch the file - # `src/transformers/data/processors/squad.py` (where this test fails for this model) - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - # UMT5ForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (UMT5Model, UMT5ForConditionalGeneration, UMT5ForQuestionAnswering): - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - def test_with_sequence_classification_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_sequence_classification_head(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_generate_with_head_masking(self): - attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"] - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config = config_and_inputs[0] - model = UMT5ForConditionalGeneration(config).eval() - - head_masking = { - "head_mask": ops.zeros(config.num_layers, config.num_heads), - "decoder_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - "cross_attn_head_mask": ops.zeros(config.num_decoder_layers, config.num_heads), - } - - for attn_name, (name, mask) in zip(attention_names, head_masking.items()): - head_masks = {name: mask} - # Explicitly pass decoder_head_mask as it is required from T5 model when head_mask specified - if name == "head_mask": - head_masks["decoder_head_mask"] = ops.ones( - config.num_decoder_layers, config.num_heads - ) - - out = model.generate( - config_and_inputs[1]["input_ids"], - num_beams=1, - max_length=3, - output_attentions=True, - return_dict_in_generate=True, - **head_masks, - ) - # We check the state of decoder_attentions and cross_attentions just from the last step - attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1] - self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def 
test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -# Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTester with T5->UMT5 -class UMT5EncoderOnlyModelTester: - def __init__( - self, - parent, - vocab_size=99, - batch_size=13, - encoder_seq_length=7, - # For common tests - use_attention_mask=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - is_training=False, - dropout_rate=0.1, - initializer_factor=0.002, - is_encoder_decoder=False, - eos_token_id=1, - pad_token_id=0, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.encoder_seq_length = encoder_seq_length - # For common tests - self.seq_length = self.encoder_seq_length - self.use_attention_mask = use_attention_mask - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.d_ff = d_ff - self.relative_attention_num_buckets = relative_attention_num_buckets - self.dropout_rate = dropout_rate - self.initializer_factor = initializer_factor - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.is_encoder_decoder = is_encoder_decoder - self.scope = None - self.is_training = is_training - - def get_large_model_config(self): - return UMT5Config.from_pretrained("google-t5/t5-base") - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) - - config = UMT5Config( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - d_ff=self.d_ff, - d_kv=self.hidden_size // self.num_attention_heads, - num_layers=self.num_hidden_layers, - num_heads=self.num_attention_heads, - relative_attention_num_buckets=self.relative_attention_num_buckets, - dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor, - eos_token_id=self.eos_token_id, - bos_token_id=self.pad_token_id, - pad_token_id=self.pad_token_id, - is_encoder_decoder=self.is_encoder_decoder, - ) - - return ( - config, - input_ids, - attention_mask, - ) - - def create_and_check_model( - self, - config, - input_ids, - attention_mask, - ): - model = UMT5EncoderModel(config=config) - model.eval() - result = model( - input_ids=input_ids, - attention_mask=attention_mask, - ) - result = model(input_ids=input_ids) - encoder_output = result.last_hidden_state - - self.parent.assertEqual(encoder_output.shape, (self.batch_size, self.encoder_seq_length, self.hidden_size)) - - def create_and_check_model_fp16_forward( - self, - config, - input_ids, - attention_mask, - ): - model = UMT5EncoderModel(config=config).half().eval() - output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] - self.parent.assertFalse(ops.isnan(output).any().item()) - - def create_and_check_with_token_classification_head( - self, - config, - input_ids, - attention_mask, - ): - labels = mindspore.tensor([1] * self.seq_length * self.batch_size, dtype=mindspore.int64) - model = UMT5ForTokenClassification(config=config).eval() - outputs = model( - input_ids=input_ids, - labels=labels, - 
attention_mask=attention_mask, - ) - self.parent.assertEqual(outputs["logits"].shape, (self.batch_size, self.seq_length, config.num_labels)) - self.parent.assertEqual(outputs["loss"].shape, ()) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - attention_mask, - ) = config_and_inputs - - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -# Copied from tests.models.t5.test_modeling_t5.T5EncoderOnlyModelTest with T5->UMT5 -class UMT5EncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (UMT5EncoderModel, UMT5ForTokenClassification) if is_mindspore_available() else () - test_pruning = False - test_resize_embeddings = False - test_model_parallel = True - pipeline_model_mapping = ( - { - "token-classification": UMT5ForTokenClassification, - } - if is_mindspore_available() - else {} - ) - all_parallelizable_model_classes = (UMT5EncoderModel,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = UMT5EncoderOnlyModelTester(self) - self.config_tester = ConfigTester(self, config_class=UMT5Config, d_model=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - - def test_with_token_classification_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs) - - -@require_mindspore -@require_sentencepiece -@require_tokenizers -class Umt5IntegrationTest(unittest.TestCase): - @slow - @unittest.skip( - "Unless we stop stripping left and right by default for all special tokens, the expected ids obtained here will not match the original ones. Wait for https://github.com/huggingface/transformers/pull/23909 to be merged" - ) - def test_small_integration_test(self): - """ - For comparison run the kaggle notbook available here : https://www.kaggle.com/arthurzucker/umt5-inference - """ - - model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small", return_dict=True) - tokenizer = AutoTokenizer.from_pretrained("google/umt5-small", use_fast=False, legacy=False) - input_text = [ - "Bonjour monsieur bien .", - "No se como puedo .", - "This is the reason why we them.", - "The walks in , seats", - "A walks into a bar and orders a with pinch of .", - ] - input_ids = tokenizer(input_text, return_tensors="ms", padding=True).input_ids - # fmt: off - EXPECTED_IDS = mindspore.tensor( - [ - [ 38530, 210703, 256299, 1410, 256298, 274, 1, 0,0, 0, 0, 0, 0, 0, 0, 0,0, 0], - [ 826, 321, 671, 25922, 256299, 274, 1, 0,0, 0, 0, 0, 0, 0, 0, 0,0, 0], - [ 1460, 339, 312, 19014, 10620, 758, 256299, 2355,274, 1, 0, 0, 0, 0, 0, 0,0, 0], - [ 517, 256299, 14869, 281, 301, 256298, 275, 119983,1, 0, 0, 0, 0, 0, 0, 0,0, 0], - [ 320, 256299, 14869, 281, 2234, 289, 2275, 333,61391, 289, 256298, 543, 256297, 168714, 329, 256296,274, 1], - ] - ) - # fmt: on - assert ops.allclose(input_ids, EXPECTED_IDS) - - generated_ids = model.generate(input_ids) - EXPECTED_FILLING = [ - " et [eod] .. 
[eod] 💐 💐 💐 💐 💐 💐 💐 💐 💐 💐 💐 ajšietostolleuxajšie", - "..,<0x0A>...spech <0x0A> ", - " are not going to be a part of the world. We are not going to be a part of and<0x0A>.", - " door, the door 피해[/", - "nyone who drink a alcohol A A. This I", - ] - filling = tokenizer.batch_decode(generated_ids) - self.assertEqual(filling, EXPECTED_FILLING) \ No newline at end of file diff --git a/tests/transformers/models/unispeech/__init__.py b/tests/transformers/models/unispeech/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/unispeech/test_modeling_unispeech.py b/tests/transformers/models/unispeech/test_modeling_unispeech.py deleted file mode 100644 index 88e4307b6..000000000 --- a/tests/transformers/models/unispeech/test_modeling_unispeech.py +++ /dev/null @@ -1,585 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch UniSpeech model.""" - -import math -import unittest -import numpy as np -import pytest -from datasets import load_dataset -from mindspore import set_seed - -from mindnlp.transformers import UniSpeechConfig -from mindnlp.utils.testing_utils import require_soundfile, require_mindspore, slow, is_mindspore_available -from mindnlp.core.nn import functional as F -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - UniSpeechForCTC, - UniSpeechForPreTraining, - UniSpeechForSequenceClassification, - UniSpeechModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - - -class UniSpeechModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - vocab_size=32, - do_stable_layer_norm=False, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = 
num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.scope = scope - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return UniSpeechConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = UniSpeechModel(config=config) - # model.to() - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = UniSpeechModel(config=config) - # model.to() - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - attention_mask[i, input_lengths[i]:] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i: i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i: i + 1, : output.shape[1]] - self.parent.assertTrue(np.allclose(output.asnumpy(), batch_output.asnumpy(), atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = UniSpeechForCTC(config=config) - # model.to() - - # make sure that dropout is disabled - # model.eval() - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - 
max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], int(min(max_length_labels) - 1)), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - attention_mask[i, input_lengths[i]:] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = UniSpeechForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - attention_mask[i, input_lengths[i]:] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = UniSpeechForCTC(config=config) - # model.to() - model.set_train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], int(max(max_length_labels) - 2)), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1:] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = UniSpeechForSequenceClassification(config=config) - model.set_train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = UniSpeechForCTC(config) - model.set_train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = 
model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], int(max(max_length_labels) - 2)), model.config.vocab_size + 100) - - with pytest.raises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class UniSpeechRobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (UniSpeechForCTC, UniSpeechModel, UniSpeechForSequenceClassification, UniSpeechForPreTraining) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": UniSpeechForSequenceClassification, - "automatic-speech-recognition": UniSpeechForCTC, - "feature-extraction": UniSpeechModel, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = UniSpeechModelTester( - self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True - ) - self.config_tester = ConfigTester(self, config_class=UniSpeechConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # UniSpeech has no inputs_embeds - @unittest.skip(reason="UniSpeech has no inputs_embeds") - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - @unittest.skip(reason="UniSpeech has no inputs_embeds") - def test_forward_signature(self): - pass - - # UniSpeech cannot resize token embeddings - # since it has no tokens embeddings - @unittest.skip(reason="UniSpeech has no tokens embeds") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="UniSpeech has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - 
input_values = inputs_dict["input_values"] - - input_lengths = mindspore.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=mindspore.int64 - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], int(output_lengths[0] - 2)), self.model_tester.vocab_size) - # labels = ids_tensor((input_values.shape[0], int(output_lengths[0] - 2)), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = UniSpeechForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech", mask_feature_prob=0.2, mask_feature_length=2, - ignore_mismatched_sizes=True - ) - # model.to().train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech", return_attention_mask=True, ignore_mismatched_sizes=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = UniSpeechForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech", mask_time_prob=0.2, mask_time_length=2, - ignore_mismatched_sizes=True - ) - # model.to().train() - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech", return_attention_mask=True, 
ignore_mismatched_sizes=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_feature_prob_ctc_single_batch(self): - model = UniSpeechForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech", - mask_time_prob=0.2, - mask_feature_prob=0.2, - mask_time_length=2, - mask_feature_length=2, - ignore_mismatched_sizes=True - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech", return_attention_mask=True, ignore_mismatched_sizes=True - ) - - batch_duration_in_seconds = [6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (1, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = UniSpeechModel.from_pretrained("microsoft/unispeech-large-1500h-cv", ignore_mismatched_sizes=True) - self.assertIsNotNone(model) - - -@require_mindspore -@require_soundfile -@slow -class UniSpeechModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) - return ds[:num_samples] - - @slow - def test_inference_pretraining(self): - model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv", - ignore_mismatched_sizes=True) - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53", - ignore_mismatched_sizes=True) - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - with no_grad(): - set_seed(0) - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - ) - - # compute cosine similarity - cosine_sim = F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - - # pretrained model should have learned a high cosine similarity - self.assertTrue(cosine_sim.mean() > 0.5) - - # fmt: off - expected_cosine_sim_slice = mindspore.tensor( - [[0.8290, 0.8335, 0.8815, 0.8580, 0.8249], - [0.8892, 0.9221, 0.8711, 0.8601, 0.8482]], - ) - # fmt: on - - self.assertTrue(np.allclose(cosine_sim[:, :5].asnumpy(), expected_cosine_sim_slice.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/unispeech_sat/__init__.py b/tests/transformers/models/unispeech_sat/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git 
a/tests/transformers/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/transformers/models/unispeech_sat/test_modeling_unispeech_sat.py deleted file mode 100644 index ee76481ea..000000000 --- a/tests/transformers/models/unispeech_sat/test_modeling_unispeech_sat.py +++ /dev/null @@ -1,929 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore UniSpeechSat model.""" - -import math -import unittest - -import numpy as np -import pytest -from datasets import load_dataset - -from mindnlp.transformers import UniSpeechSatConfig -from mindnlp.core.nn.functional import normalize -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_soundfile, - require_mindspore, - slow -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - UniSpeechSatForAudioFrameClassification, - UniSpeechSatForCTC, - UniSpeechSatForPreTraining, - UniSpeechSatForSequenceClassification, - UniSpeechSatForXVector, - UniSpeechSatModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - - -class UniSpeechSatModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - do_stable_layer_norm=False, - tdnn_dim=(32, 32), - tdnn_kernel=(3, 3), - tdnn_dilation=(1, 1), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - 
self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - self.scope = scope - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return UniSpeechSatConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = UniSpeechSatModel(config=config) - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - model = UniSpeechSatModel(config=config) - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - attention_mask[i, input_lengths[i]:] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i: i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i: i + 1, : output.shape[1]] - self.parent.assertTrue(np.allclose(output.asnumpy(), batch_output.asnumpy(), atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = UniSpeechSatForCTC(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = 
model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], int(min(max_length_labels) - 1)), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - attention_mask[i, input_lengths[i]:] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = UniSpeechSatForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - attention_mask[i, input_lengths[i]:] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = UniSpeechSatForCTC(config=config) - model.set_train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], int(max(max_length_labels) - 2)), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1:] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # gradient_function = mindspore.grad(model, grad_position=None, weights=model.trainable_params()) - # g = gradient_function(input_values, labels=labels) - # self.parent.assertIsNotNone(g) - - # loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = UniSpeechSatForSequenceClassification(config=config) - model.set_train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # loss.backward() - - def check_xvector_training(self, config, *args): - config.ctc_zero_infinity = True - model = 
UniSpeechSatForXVector(config=config) - - model.set_train() - - # freeze everything but the classification head - model.freeze_base_model() - - # use a longer sequence length to account for TDNN temporal downsampling - input_values = floats_tensor([self.batch_size, self.seq_length * 2], scale=1.0) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i]:] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = UniSpeechSatForCTC(config) - - model.set_train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], int(max(max_length_labels) - 2)), model.config.vocab_size + 100) - - with pytest.raises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class UniSpeechSatModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - UniSpeechSatForCTC, - UniSpeechSatForPreTraining, - UniSpeechSatModel, - UniSpeechSatForSequenceClassification, - UniSpeechSatForAudioFrameClassification, - UniSpeechSatForXVector, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": UniSpeechSatForSequenceClassification, - "automatic-speech-recognition": UniSpeechSatForCTC, - "feature-extraction": UniSpeechSatModel, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = UniSpeechSatModelTester(self) - self.config_tester = ConfigTester(self, config_class=UniSpeechSatConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Model has no input_embeds") - def test_inputs_embeds(self): - 
pass - - @unittest.skip(reason="Model has input_values instead of input_ids") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Model has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Model has no input_embeds") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="UniSpeechSat does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = mindspore.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=mindspore.int64 - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], int(output_lengths[0] - 2)), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - # hidden_states.retain_grad() - # attentions.retain_grad() - # - # output.flatten()[0].backward(retain_graph=True) - # - # self.assertIsNotNone(hidden_states.grad) - # self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "label_embeddings_concat", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def 
test_mask_feature_prob_ctc(self): - model = UniSpeechSatForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = UniSpeechSatForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", mask_time_prob=0.2, mask_time_length=2 - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-base-plus") - self.assertIsNotNone(model) - - -@require_mindspore -class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (UniSpeechSatForCTC, UniSpeechSatForPreTraining, UniSpeechSatModel, UniSpeechSatForSequenceClassification) - if is_mindspore_available() - else () - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = UniSpeechSatModelTester( - self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True - ) - self.config_tester = ConfigTester(self, config_class=UniSpeechSatConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() 
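Note: test_mask_feature_prob_ctc and test_mask_time_prob_ctc only assert the logits shape, because SpecAugment-style time/feature masking perturbs values but never dimensions. A minimal numpy sketch of that invariant (mask parameters and array sizes are illustrative, not the model's internals):

```python
import numpy as np

def apply_random_masks(features, mask_prob=0.2, mask_length=2, axis=1, rng=None):
    # zero out random spans along `axis`; only values change, never the shape
    rng = rng or np.random.default_rng(0)
    masked = features.copy()
    num_spans = int(mask_prob * features.shape[axis] / mask_length)
    for _ in range(num_spans):
        start = int(rng.integers(0, features.shape[axis] - mask_length))
        index = [slice(None)] * features.ndim
        index[axis] = slice(start, start + mask_length)
        masked[tuple(index)] = 0.0
    return masked

hidden_like = np.random.rand(4, 1498, 32).astype(np.float32)
assert apply_random_masks(hidden_like).shape == hidden_like.shape
```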
- self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Model has no input_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Model has input_values instead of input_ids") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Model has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Model has no input_embeds") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="UniSpeechSat does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = mindspore.tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=mindspore.int64 - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], int(output_lengths[0] - 2)), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - # hidden_states.retain_grad() - # attentions.retain_grad() - # - # output.flatten()[0].backward(retain_graph=True) - # - # self.assertIsNotNone(hidden_states.grad) - # self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "label_embeddings_concat", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - 
module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = UniSpeechSatForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = UniSpeechSatForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", mask_time_prob=0.2, mask_time_length=2 - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_feature_prob_ctc_single_batch(self): - model = UniSpeechSatForCTC.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", - mask_time_prob=0.2, - mask_feature_prob=0.2, - mask_time_length=2, - mask_feature_length=2, - ) - model.set_train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True - ) - - batch_duration_in_seconds = [6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (1, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-large") - self.assertIsNotNone(model) - - -@require_mindspore -@require_soundfile -@slow -class UniSpeechSatModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) - - return ds[:num_samples] - - def test_inference_encoder_base(self): - model = 
UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-base-plus") - - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - "facebook/wav2vec2-base", return_attention_mask=True - ) - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - ) - - # fmt: off - expected_hidden_states_slice = mindspore.tensor( - [[[-0.0743, 0.1384], - [-0.0845, 0.1704]], - [[-0.0954, 0.1936], - [-0.1123, 0.2095]]], - ) - # fmt: on - - self.assertTrue( - np.allclose(outputs.last_hidden_state[:, :2, -2:].asnumpy(), expected_hidden_states_slice.asnumpy(), - atol=1e-3)) - - def test_inference_encoder_large(self): - model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-large") - - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53") - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - ) - - # fmt: off - expected_hidden_states_slice = mindspore.tensor( - [[[-0.1172, -0.0797], - [-0.0012, 0.0213]], - [[-0.1225, -0.1277], - [-0.0668, -0.0585]]], - ) - # fmt: on - - self.assertTrue( - np.allclose(outputs.last_hidden_state[:, :2, -2:].asnumpy(), expected_hidden_states_slice.asnumpy(), - atol=1e-3)) - - def test_inference_diarization(self): - model = UniSpeechSatForAudioFrameClassification.from_pretrained("microsoft/unispeech-sat-base-plus-sd") - processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sd") - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True, sampling_rate=16_000) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = mindspore.tensor( - [ - [[-5.6119, -5.5845], [-3.7772, -5.4824], [-3.6914, -5.1619], [-4.7560, -5.0496]], - [[-6.3785, -4.8365], [-5.5863, -5.4149], [-5.5639, -4.8469], [-6.1511, -4.0052]], - [[-6.0355, -3.7414], [-5.5968, -4.8061], [-5.4620, -4.7310], [-5.5864, -4.6078]], - [[-5.9493, -4.8963], [-4.4050, -5.4476], [-4.1755, -5.1395], [-4.0272, -4.3705]], - ], - ) - self.assertEqual(labels[0, :, 0].sum(), 270) - self.assertEqual(labels[0, :, 1].sum(), 647) - self.assertTrue(np.allclose(outputs.logits[:, :4].asnumpy(), expected_logits.asnumpy(), atol=1e-2)) - - def test_inference_speaker_verification(self): - model = UniSpeechSatForXVector.from_pretrained("microsoft/unispeech-sat-base-plus-sv") - processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sv") - input_data = self._load_superb("si", 4) - - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - labels = mindspore.tensor([5, 1, 1, 3]).T - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = normalize(outputs.embeddings, dim=-1) - - # id10002 vs id10002 - self.assertAlmostEqual(ops.cosine_similarity(embeddings[1], embeddings[2], dim=-1).item(), 0.9671, 3) - # id10006 vs id10002 - 
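Note: the speaker-verification check reduces to comparing L2-normalized x-vector embeddings with cosine similarity: same-speaker pairs should score near 1.0, cross-speaker pairs noticeably lower. A small numpy sketch of that comparison (the embedding values are hypothetical):

```python
import numpy as np

def cosine_similarity(a, b):
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

emb_a = np.array([0.2, 0.9, 0.1])   # hypothetical x-vector embeddings
emb_b = np.array([0.8, -0.1, 0.4])
print(cosine_similarity(emb_a, emb_a))  # identical embedding: 1.0
print(cosine_similarity(emb_a, emb_b))  # different speaker: noticeably lower
```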
self.assertAlmostEqual(ops.cosine_similarity(embeddings[0], embeddings[1], dim=-1).item(), 0.4941, 3) - # id10002 vs id10004 - self.assertAlmostEqual(ops.cosine_similarity(embeddings[2], embeddings[3], dim=-1).item(), 0.5616, 3) - - self.assertAlmostEqual(outputs.loss.item(), 18.5925, 2) diff --git a/tests/transformers/models/univnet/__init__.py b/tests/transformers/models/univnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/univnet/test_feature_extraction_univnet.py b/tests/transformers/models/univnet/test_feature_extraction_univnet.py deleted file mode 100644 index f5cf0ada0..000000000 --- a/tests/transformers/models/univnet/test_feature_extraction_univnet.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -import os -import random -import tempfile -import unittest - -import numpy as np -from datasets import Audio, load_dataset - -from mindnlp.transformers import UnivNetFeatureExtractor -from mindnlp.utils.testing_utils import check_json_file_has_correct_format, require_mindspore, slow -from mindnlp.utils import is_mindspore_available - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - - -global_rng = random.Random() - - -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -class UnivNetFeatureExtractionTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - min_seq_length=400, - max_seq_length=2000, - feature_size=1, - sampling_rate=24000, - padding_value=0.0, - do_normalize=True, - num_mel_bins=100, - hop_length=256, - win_length=1024, - win_function="hann_window", - filter_length=1024, - max_length_s=10, - fmin=0.0, - fmax=12000, - mel_floor=1e-9, - center=False, - compression_factor=1.0, - compression_clip_val=1e-5, - normalize_min=-11.512925148010254, - normalize_max=2.3143386840820312, - model_in_channels=64, - pad_end_length=10, - ): - self.parent = parent - self.batch_size = batch_size - self.min_seq_length = min_seq_length - self.max_seq_length = max_seq_length - self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) - - self.feature_size = feature_size - self.sampling_rate = sampling_rate - self.padding_value = padding_value - self.do_normalize = do_normalize - self.num_mel_bins = num_mel_bins - self.hop_length = hop_length - self.win_length = win_length - self.win_function = win_function - self.filter_length = filter_length - self.max_length_s = max_length_s - self.fmin = fmin - self.fmax = fmax - 
self.mel_floor = mel_floor - self.center = center - self.compression_factor = compression_factor - self.compression_clip_val = compression_clip_val - self.normalize_min = normalize_min - self.normalize_max = normalize_max - self.model_in_channels = model_in_channels - self.pad_end_length = pad_end_length - - def prepare_feat_extract_dict(self): - return { - "feature_size": self.feature_size, - "sampling_rate": self.sampling_rate, - "padding_value": self.padding_value, - "do_normalize": self.do_normalize, - "num_mel_bins": self.num_mel_bins, - "hop_length": self.hop_length, - "win_length": self.win_length, - "win_function": self.win_function, - "filter_length": self.filter_length, - "max_length_s": self.max_length_s, - "fmin": self.fmin, - "fmax": self.fmax, - "mel_floor": self.mel_floor, - "center": self.center, - "compression_factor": self.compression_factor, - "compression_clip_val": self.compression_clip_val, - "normalize_min": self.normalize_min, - "normalize_max": self.normalize_max, - "model_in_channels": self.model_in_channels, - "pad_end_length": self.pad_end_length, - } - - def prepare_inputs_for_common(self, equal_length=False, numpify=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_length: - speech_inputs = floats_list((self.batch_size, self.max_seq_length)) - else: - # make sure that inputs increase in size - speech_inputs = [ - _flatten(floats_list((x, self.feature_size))) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) - ] - - if numpify: - speech_inputs = [np.asarray(x) for x in speech_inputs] - - return speech_inputs - - -class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = UnivNetFeatureExtractor - - def setUp(self): - self.feat_extract_tester = UnivNetFeatureExtractionTester(self) - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - mel_1 = feat_extract_first.mel_filters - mel_2 = feat_extract_second.mel_filters - self.assertTrue(np.allclose(mel_1, mel_2)) - self.assertEqual(dict_first, dict_second) - - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file - def test_feat_extract_to_json_file(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - dict_first = feat_extract_first.to_dict() - dict_second = feat_extract_second.to_dict() - mel_1 = feat_extract_first.mel_filters - mel_2 = feat_extract_second.mel_filters - self.assertTrue(np.allclose(mel_1, mel_2)) - self.assertEqual(dict_first, dict_second) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feature_extractor 
= self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test feature size - input_features = feature_extractor( - np_speech_inputs, padding="max_length", max_length=1600, return_tensors="np" - ).input_features - self.assertTrue(input_features.ndim == 3) - # Note: for some reason I get a weird padding error when feature_size > 1 - # self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size) - # Note: we use the shape convention (batch_size, seq_len, num_mel_bins) - self.assertTrue(input_features.shape[-1] == feature_extractor.num_mel_bins) - - # Test not batched input - encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. - speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test truncation required - speech_inputs = [ - floats_list((1, x))[0] - for x in range((feature_extractor.num_max_samples - 100), (feature_extractor.num_max_samples + 500), 200) - ] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - speech_inputs_truncated = [x[: feature_extractor.num_max_samples] for x in speech_inputs] - np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated] - - encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_batched_unbatched_consistency(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = floats_list((1, 800))[0] - np_speech_inputs = np.asarray(speech_inputs) - - # Test unbatched vs batched list - encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor([speech_inputs], return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test np.ndarray vs List[np.ndarray] - encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor([np_speech_inputs], 
return_tensors="np").input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test unbatched np.ndarray vs batched np.ndarray - encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features - encoded_sequences_2 = feature_extractor( - np.expand_dims(np_speech_inputs, axis=0), return_tensors="np" - ).input_features - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_generate_noise(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - features = feature_extractor(speech_inputs, return_noise=True) - input_features = features.input_features - noise_features = features.noise_sequence - - for spectrogram, noise in zip(input_features, noise_features): - self.assertEqual(spectrogram.shape[0], noise.shape[0]) - - def test_pad_end(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - input_features1 = feature_extractor(speech_inputs, padding=False, pad_end=False).input_features - input_features2 = feature_extractor(speech_inputs, padding=False, pad_end=True).input_features - - for spectrogram1, spectrogram2 in zip(input_features1, input_features2): - self.assertEqual(spectrogram1.shape[0] + self.feat_extract_tester.pad_end_length, spectrogram2.shape[0]) - - def test_generate_noise_and_pad_end(self): - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - features = feature_extractor(speech_inputs, padding=False, return_noise=True, pad_end=True) - input_features = features.input_features - noise_features = features.noise_sequence - - for spectrogram, noise in zip(input_features, noise_features): - self.assertEqual(spectrogram.shape[0], noise.shape[0]) - - @require_mindspore - def test_batch_decode(self): - import mindspore as ms - - feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) - input_lengths = list(range(800, 1400, 200)) - pad_samples = feature_extractor.pad_end_length * feature_extractor.hop_length - output_features = { - "waveforms": ms.tensor(floats_list((3, max(input_lengths) + pad_samples))), - "waveform_lengths": ms.tensor(input_lengths), - } - waveforms = feature_extractor.batch_decode(**output_features) - - for input_length, waveform in zip(input_lengths, waveforms): - self.assertTrue(len(waveform.shape) == 1, msg="Individual output waveforms should be 1D") - self.assertEqual(waveform.shape[0], input_length) - - @require_mindspore - # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad - def test_double_precision_pad(self): - import mindspore as ms - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100, 32).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_features.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_features": inputs}], 
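Note: test_pad_end expects exactly pad_end_length extra spectrogram frames when pad_end is enabled. With center=False the frame count follows the usual STFT framing formula; a sketch under the assumption that pad_end appends pad_end_length hops of silence (win_length/hop_length taken from the tester defaults above):

```python
def num_frames(num_samples, win_length=1024, hop_length=256, pad_end_length=0):
    # non-centered STFT framing; pad_end modeled as appending pad_end_length hops of silence
    num_samples += pad_end_length * hop_length
    return (num_samples - win_length) // hop_length + 1

assert num_frames(16_000, pad_end_length=10) - num_frames(16_000) == 10
```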
return_tensors="ms") - self.assertTrue(pt_processed.input_features.dtype == ms.float32) - - def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) - ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate)) - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples] - - @slow - @require_mindspore - def test_integration(self): - # fmt: off - EXPECTED_INPUT_FEATURES = ms.tensor( - [ - -5.0229, -6.1358, -5.8346, -5.4447, -5.6707, -5.8577, -5.0464, -5.0058, - -5.6015, -5.6410, -5.4325, -5.6116, -5.3700, -5.7956, -5.3196, -5.3274, - -5.9655, -5.6057, -5.8382, -5.9602, -5.9005, -5.9123, -5.7669, -6.1441, - -5.5168, -5.1405, -5.3927, -6.0032, -5.5784, -5.3728 - ], - ) - # fmt: on - - input_speech, sr = self._load_datasamples(1) - - feature_extractor = UnivNetFeatureExtractor() - input_features = feature_extractor(input_speech, sampling_rate=sr[0], return_tensors="ms").input_features - self.assertEqual(input_features.shape, (1, 548, 100)) - - input_features_mean = ops.mean(input_features) - input_features_stddev = ops.std(input_features) - - EXPECTED_MEAN = ms.tensor(-6.18862009) - EXPECTED_STDDEV = ms.tensor(2.80845642) - - self.assertTrue(np.allclose(input_features_mean.asnumpy(), EXPECTED_MEAN.asnumpy(), atol=5e-5, rtol=5e-6)) - self.assertTrue(np.allclose(input_features_stddev.asnumpy(), EXPECTED_STDDEV.asnumpy())) - self.assertTrue(np.allclose(input_features[0, :30, 0].asnumpy(), EXPECTED_INPUT_FEATURES.asnumpy(), atol=1e-4, rtol=1e-5)) diff --git a/tests/transformers/models/univnet/test_modeling_univnet.py b/tests/transformers/models/univnet/test_modeling_univnet.py deleted file mode 100644 index 0b2821107..000000000 --- a/tests/transformers/models/univnet/test_modeling_univnet.py +++ /dev/null @@ -1,438 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import inspect -import random -import unittest -import numpy as np -from datasets import Audio, load_dataset - -from mindnlp.transformers import UnivNetConfig, UnivNetFeatureExtractor -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, -) - - -if is_mindspore_available(): - import mindspore as ms - from mindspore import ops - - from mindnlp.transformers import UnivNetModel - - -class UnivNetModelTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=7, - in_channels=8, - hidden_channels=8, - num_mel_bins=20, - kernel_predictor_hidden_channels=8, - seed=0, - is_training=False, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.num_mel_bins = num_mel_bins - self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels - self.seed = seed - self.is_training = is_training - - def prepare_noise_sequence(self): - noise_shape = (self.batch_size, self.seq_length, self.in_channels) - # Create noise on CPU for reproducibility - noise_sequence = ops.randn(noise_shape, seed=self.seed, dtype=ms.float32) - return noise_sequence - - def prepare_config_and_inputs(self): - spectrogram = floats_tensor( - [self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0 - ) - noise_sequence = self.prepare_noise_sequence() - # noise_sequence = noise_sequence - config = self.get_config() - return config, spectrogram, noise_sequence - - def get_config(self): - return UnivNetConfig( - model_in_channels=self.in_channels, - model_hidden_channels=self.hidden_channels, - num_mel_bins=self.num_mel_bins, - kernel_predictor_hidden_channels=self.kernel_predictor_hidden_channels, - ) - - def create_and_check_model(self, config, spectrogram, noise_sequence): - model = UnivNetModel(config=config).set_train(False) - result = model(spectrogram, noise_sequence)[0] - self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256)) - - def prepare_config_and_inputs_for_common(self): - config, spectrogram, noise_sequence = self.prepare_config_and_inputs() - inputs_dict = {"input_features": spectrogram, "noise_sequence": noise_sequence} - return config, inputs_dict - - -@require_mindspore -class UnivNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (UnivNetModel,) if is_mindspore_available() else () - # UnivNetModel currently cannot be traced with torch.jit.trace. - test_torchscript = False - # The UnivNetModel is not a transformer and does not use any attention mechanisms, so skip transformer/attention - # related tests. - test_pruning = False - test_resize_embeddings = False - test_resize_position_embeddings = False - test_head_masking = False - # UnivNetModel is not a sequence classification model. - test_mismatched_shapes = False - # UnivNetModel does not have a base_model_prefix attribute. - test_missing_keys = False - # UnivNetModel does not implement a parallelize method. 
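Note: create_and_check_model asserts an output of shape (batch_size, seq_length * 256), i.e. the vocoder maps a (batch, frames, num_mel_bins) spectrogram plus a (batch, frames, model_in_channels) noise sequence to a waveform upsampled 256x per frame. A shape-only numpy stand-in (not the real model; the 256 factor is inferred from the assertion itself) that makes the contract concrete:

```python
import numpy as np

HOP_LENGTH = 256  # assumed upsampling factor implied by the (batch, seq_length * 256) assertion

def fake_vocoder(spectrogram, noise):
    # shape-only stand-in: (batch, frames, mel) + (batch, frames, in_channels) -> (batch, frames * 256)
    batch, frames, _ = spectrogram.shape
    assert noise.shape[:2] == (batch, frames)
    return np.zeros((batch, frames * HOP_LENGTH), dtype=np.float32)

spec = np.random.rand(2, 7, 20).astype(np.float32)
noise = np.random.randn(2, 7, 8).astype(np.float32)
assert fake_vocoder(spec, noise).shape == (2, 7 * 256)
```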
- test_model_parallel = False - is_encoder_decoder = False - has_attentions = False - - input_name = "input_features" - - def setUp(self): - self.model_tester = UnivNetModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=UnivNetConfig, - has_text_modality=False, - common_properties=["num_mel_bins"], - ) - - @unittest.skip(reason="fix this once it gets more usage") - def test_multi_gpu_data_parallel_forward(self): - super().test_multi_gpu_data_parallel_forward() - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_features", - ] - self.assertListEqual( - arg_names[: len(expected_arg_names)], expected_arg_names - ) - - @unittest.skip(reason="UnivNetModel does not output hidden_states.") - def test_hidden_states_output(self): - pass - - @unittest.skip( - reason="UnivNetModel.forward does not accept an inputs_embeds argument." - ) - def test_inputs_embeds(self): - pass - - @unittest.skip( - reason="UnivNetModel does not use input embeddings and thus has no get_input_embeddings method." - ) - def test_model_get_set_embeddings(self): - pass - - @unittest.skip( - reason="UnivNetModel does not support all arguments tested, such as output_hidden_states." - ) - def test_model_outputs_equivalence(self): - pass - - @unittest.skip(reason="UnivNetModel does not output hidden_states.") - def test_retain_grad_hidden_states_attentions(self): - pass - - def test_batched_inputs_outputs(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - - batched_spectrogram = inputs["input_features"] - batched_noise_sequence = inputs["noise_sequence"] - - batched_outputs = model( - batched_spectrogram, - batched_noise_sequence, - )[0] - - self.assertEqual( - batched_spectrogram.shape[0], - batched_outputs.shape[0], - msg="Got different batch dims for input and output", - ) - - def test_unbatched_inputs_outputs(self): - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - outputs = model(inputs["input_features"][:1], inputs["noise_sequence"][:1])[ - 0 - ] - self.assertTrue( - outputs.shape[0] == 1, - msg="Unbatched input should create batched output with bsz = 1", - ) - - -@slow -class UnivNetModelIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - - def _load_datasamples(self, num_samples, sampling_rate=24000): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", - "clean", - split="validation", - trust_remote_code=True, - ) - ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate)) - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples], [ - x["sampling_rate"] for x in speech_samples - ] - - def 
get_inputs(self, num_samples: int = 3, noise_length: int = 10, seed: int = 0): - # Note: hardcode model_in_channels -> 64 - if num_samples == 1: - noise_sequence_shape = (64, noise_length) - else: - noise_sequence_shape = (num_samples, 64, noise_length) - # Explicity generate noise_sequence on CPU for consistency. - noise_sequence = ops.randn(noise_sequence_shape, seed=seed, dtype=ms.float32) - # Put noise_sequence on the desired device. - noise_sequence = noise_sequence - - # Note: hardcode num_mel_channels -> 100 - if num_samples == 1: - spectrogram_shape = [100, noise_length] - else: - spectrogram_shape = [num_samples, 100, noise_length] - spectrogram = floats_tensor( - spectrogram_shape, scale=1.0, rng=random.Random(seed) - ) - # Note: spectrogram should already be on torch_device - - # Permute to match diffusers implementation - if num_samples == 1: - noise_sequence = noise_sequence.swapaxes(1, 0) - spectrogram = spectrogram.swapaxes(1, 0) - else: - noise_sequence = noise_sequence.swapaxes(2, 1) - spectrogram = spectrogram.swapaxes(2, 1) - - inputs = { - "input_features": spectrogram, - "noise_sequence": noise_sequence, - } - - return inputs - - def test_model_inference_batched(self): - # Load sample checkpoint from Tortoise TTS - model = UnivNetModel.from_pretrained("dg845/univnet-dev", from_pt=True) - model.set_train(False) - - # Get batched noise and spectrogram inputs. - input_speech = self.get_inputs(num_samples=3) - - waveform = model(**input_speech)[0] - waveform_mean = ops.mean(waveform) - waveform_stddev = ops.std(waveform) - waveform_slice = waveform[-1, -9:].flatten() - - EXPECTED_MEAN = ms.tensor(-0.19989729) - EXPECTED_STDDEV = ms.tensor(0.35230172) - EXPECTED_SLICE = ms.tensor( - [ - -0.3408, - -0.6045, - -0.5052, - 0.1160, - -0.1556, - -0.0405, - -0.3024, - -0.5290, - -0.5019, - ] - ) - print( - "111", - waveform_mean.asnumpy(), - waveform_stddev.asnumpy(), - waveform_slice.asnumpy(), - ) - self.assertTrue( - np.allclose( - waveform_mean.asnumpy(), EXPECTED_MEAN.asnumpy(), atol=1e-3, rtol=1e-3 - ) - ) - self.assertTrue( - np.allclose( - waveform_stddev.asnumpy(), - EXPECTED_STDDEV.asnumpy(), - atol=1e-4, - rtol=1e-5, - ) - ) - self.assertTrue( - np.allclose( - waveform_slice.asnumpy(), EXPECTED_SLICE.asnumpy(), atol=1e-4, rtol=1e-5 - ) - ) - - def test_model_inference_unbatched(self): - # Load sample checkpoint from Tortoise TTS - model = UnivNetModel.from_pretrained("dg845/univnet-dev", from_pt=True) - model.set_train(False) - - # Get unbatched noise and spectrogram inputs. 
- input_speech = self.get_inputs(num_samples=1) - - waveform = model(**input_speech)[0] - # waveform = waveform.cpu() - - waveform_mean = ops.mean(waveform) - waveform_stddev = ops.std(waveform) - waveform_slice = waveform[-1, -9:].flatten() - - EXPECTED_MEAN = ms.tensor(-0.22895093) - EXPECTED_STDDEV = ms.tensor(0.33986747) - EXPECTED_SLICE = ms.tensor( - [ - -0.3276, - -0.5504, - -0.3484, - 0.3574, - -0.0373, - -0.1826, - -0.4880, - -0.6431, - -0.5162, - ] - ) - print( - "222", - waveform_mean.asnumpy(), - waveform_stddev.asnumpy(), - waveform_slice.asnumpy(), - ) - self.assertTrue( - np.allclose( - waveform_mean.asnumpy(), EXPECTED_MEAN.asnumpy(), atol=1e-3, rtol=1e-3 - ) - ) - self.assertTrue( - np.allclose( - waveform_stddev.asnumpy(), - EXPECTED_STDDEV.asnumpy(), - atol=1e-4, - rtol=1e-5, - ) - ) - self.assertTrue( - np.allclose( - waveform_slice.asnumpy(), EXPECTED_SLICE.asnumpy(), atol=1e-3, rtol=1e-5 - ) - ) - - def test_integration(self): - feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev") - model = UnivNetModel.from_pretrained("dg845/univnet-dev") - model.set_train(False) - audio, sr = self._load_datasamples( - 1, sampling_rate=feature_extractor.sampling_rate - ) - - input_features = feature_extractor( - audio, sampling_rate=sr[0], return_tensors="ms" - ).input_features - # input_features = input_features - - input_speech = self.get_inputs( - num_samples=1, noise_length=input_features.shape[1] - ) - input_speech["input_features"] = input_features - - waveform = model(**input_speech)[0] - # waveform = waveform.cpu() - - waveform_mean = ops.mean(waveform) - waveform_stddev = ops.std(waveform) - waveform_slice = waveform[-1, -9:].flatten() - - EXPECTED_MEAN = ms.tensor(0.00051374) - EXPECTED_STDDEV = ms.tensor(0.058105603) - # fmt: off - EXPECTED_SLICE = ms.tensor([-4.3934e-04, -1.8203e-04, -3.3033e-04, -3.8716e-04, -1.6125e-04, 3.5389e-06, -3.3149e-04, -3.7613e-04, -2.3331e-04]) - # fmt: on - print( - "333", - waveform_mean.asnumpy(), - waveform_stddev.asnumpy(), - waveform_slice.asnumpy(), - ) - self.assertTrue( - np.allclose( - waveform_mean.asnumpy(), EXPECTED_MEAN.asnumpy(), atol=5e-6, rtol=1e-5 - ) - ) - self.assertTrue( - np.allclose( - waveform_stddev.asnumpy(), - EXPECTED_STDDEV.asnumpy(), - atol=1e-4, - rtol=1e-5, - ) - ) - self.assertTrue( - np.allclose( - waveform_slice.asnumpy(), EXPECTED_SLICE.asnumpy(), atol=1e-3, rtol=1e-3 - ) - ) diff --git a/tests/transformers/models/upernet/__init__.py b/tests/transformers/models/upernet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/upernet/test_modeling_upernet.py b/tests/transformers/models/upernet/test_modeling_upernet.py deleted file mode 100644 index 82e305f8e..000000000 --- a/tests/transformers/models/upernet/test_modeling_upernet.py +++ /dev/null @@ -1,300 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch UperNet framework.""" - -import unittest - -from huggingface_hub import hf_hub_download - -from mindnlp.transformers import ConvNextConfig, UperNetConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, -) -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import UperNetForSemanticSegmentation - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class UperNetModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - num_channels=3, - num_stages=4, - hidden_sizes=[10, 20, 30, 40], - depths=[1, 1, 1, 1], - is_training=True, - use_labels=True, - intermediate_size=37, - hidden_act="gelu", - type_sequence_label_size=10, - initializer_range=0.02, - out_features=["stage2", "stage3", "stage4"], - num_labels=3, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.num_stages = num_stages - self.hidden_sizes = hidden_sizes - self.depths = depths - self.is_training = is_training - self.use_labels = use_labels - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.out_features = out_features - self.num_labels = num_labels - self.scope = scope - self.num_hidden_layers = num_stages - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_backbone_config(self): - return ConvNextConfig( - num_channels=self.num_channels, - num_stages=self.num_stages, - hidden_sizes=self.hidden_sizes, - depths=self.depths, - is_training=self.is_training, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - out_features=self.out_features, - ) - - def get_config(self): - return UperNetConfig( - backbone_config=self.get_backbone_config(), - backbone=None, - hidden_size=64, - pool_scales=[1, 2, 3, 6], - use_auxiliary_head=True, - auxiliary_loss_weight=0.4, - auxiliary_in_channels=40, - auxiliary_channels=32, - auxiliary_num_convs=1, - auxiliary_concat_input=False, - loss_ignore_index=255, - num_labels=self.num_labels, - ) - - def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels): - model = UperNetForSemanticSegmentation(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.num_labels, self.image_size, self.image_size) - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class UperNetModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the 
tests of test_modeling_common.py, as UperNet does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (UperNetForSemanticSegmentation,) if is_mindspore_available() else () - pipeline_model_mapping = {"image-segmentation": UperNetForSemanticSegmentation} if is_mindspore_available() else {} - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_torchscript = False - has_attentions = False - - def setUp(self): - self.model_tester = UperNetModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=UperNetConfig, - has_text_modality=False, - hidden_size=37, - common_properties=["hidden_size"], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_for_semantic_segmentation(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs) - - @unittest.skip(reason="UperNet does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="UperNet does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="UperNet does not have a base model") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="UperNet does not have a base model") - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="UperNet has some layers using `add_module` which doesn't work well with `nn.DataParallel`") - def test_multi_gpu_data_parallel_forward(self): - pass - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_stages = self.model_tester.num_stages - self.assertEqual(len(hidden_states), expected_num_stages + 1) - - # ConvNext's feature maps are of shape (batch_size, num_channels, height, width) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.image_size // 4, self.model_tester.image_size // 4], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - configs_no_init.backbone_config = _config_zero_init(configs_no_init.backbone_config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="UperNet does not have tied weights") - def test_tied_model_weights_key_ignore(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = 
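Note: test_hidden_states_output expects the first ConvNext feature map at 1/4 of the input resolution, with each later stage halving it again. A short sketch of the expected per-stage resolutions, reusing the tester's image_size and num_stages as assumptions:

```python
def convnext_stage_resolutions(image_size=32, num_stages=4):
    # the stem downsamples by 4; each subsequent stage halves the resolution
    sizes = [image_size // 4]
    for _ in range(num_stages - 1):
        sizes.append(sizes[-1] // 2)
    return sizes

print(convnext_stage_resolutions())  # [8, 4, 2, 1] for the tester's 32x32 inputs
```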
"openmmlab/upernet-convnext-tiny" - model = UperNetForSemanticSegmentation.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of ADE20k -def prepare_img(): - filepath = hf_hub_download( - repo_id="hf-internal-testing/fixtures_ade20k", repo_type="dataset", filename="ADE_val_00000001.jpg" - ) - image = Image.open(filepath).convert("RGB") - return image - - -@require_mindspore -@require_vision -@slow -class UperNetModelIntegrationTest(unittest.TestCase): - def test_inference_swin_backbone(self): - processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-swin-tiny") - model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-swin-tiny") - - image = prepare_img() - inputs = processor(images=image, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - - expected_shape = (1, model.config.num_labels, 512, 512) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-7.5958, -7.5958, -7.4302], [-7.5958, -7.5958, -7.4302], [-7.4797, -7.4797, -7.3068]] - ) - self.assertTrue(ops.allclose(outputs.logits[0, 0, :3, :3], expected_slice, atol=1e-4)) - - def test_inference_convnext_backbone(self): - processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny") - model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny") - - image = prepare_img() - inputs = processor(images=image, return_tensors="ms") - - with no_grad(): - outputs = model(**inputs) - - expected_shape = (1, model.config.num_labels, 512, 512) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-8.8110, -8.8110, -8.6521], [-8.8110, -8.8110, -8.6521], [-8.7746, -8.7746, -8.6130]] - ) - self.assertTrue(ops.allclose(outputs.logits[0, 0, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/transformers/models/videomae/__init__.py b/tests/transformers/models/videomae/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/videomae/test_image_processing_videomae.py b/tests/transformers/models/videomae/test_image_processing_videomae.py deleted file mode 100644 index de7137866..000000000 --- a/tests/transformers/models/videomae/test_image_processing_videomae.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_video_inputs - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import VideoMAEImageProcessor - - -class VideoMAEImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - num_frames=10, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - crop_size=None, - ): - size = size if size is not None else {"shortest_edge": 18} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.num_frames = num_frames - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.crop_size = crop_size - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - "crop_size": self.crop_size, - } - - def expected_output_image_shape(self, images): - return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_video_inputs( - batch_size=self.batch_size, - num_frames=self.num_frames, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class VideoMAEImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = VideoMAEImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = VideoMAEImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 18}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - def test_call_pil(self): - # 
Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL videos - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], Image.Image) - - # Test not batched input - encoded_videos = image_processing(video_inputs[0], return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing(video_inputs, return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], np.ndarray) - - # Test not batched input - encoded_videos = image_processing(video_inputs[0], return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing(video_inputs, return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) - - def test_call_numpy_4_channels(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - self.image_processor_tester.num_channels = 4 - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], np.ndarray) - - # Test not batched input - encoded_videos = image_processing( - video_inputs[0], return_tensors="ms", image_mean=0, image_std=1, input_data_format="channels_first" - ).pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing( - video_inputs, return_tensors="ms", image_mean=0, image_std=1, input_data_format="channels_first" - ).pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) - self.image_processor_tester.num_channels = 3 - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, 
torchify=True) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], mindspore.Tensor) - - # Test not batched input - encoded_videos = image_processing(video_inputs[0], return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing(video_inputs, return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) diff --git a/tests/transformers/models/videomae/test_modeling_videomae.py b/tests/transformers/models/videomae/test_modeling_videomae.py deleted file mode 100644 index c06b824a6..000000000 --- a/tests/transformers/models/videomae/test_modeling_videomae.py +++ /dev/null @@ -1,398 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch VideoMAE model.""" - -import copy -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download - -from mindnlp.utils.generic import cached_property -from mindnlp.transformers import VideoMAEConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow,is_mindspore_available,is_vision_available -from mindnlp.core.serialization import load -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ( - MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, - VideoMAEForPreTraining, - VideoMAEForVideoClassification, - VideoMAEModel, - ) - - -if is_vision_available(): - from mindnlp.transformers import VideoMAEImageProcessor - - -class VideoMAEModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=10, - num_channels=3, - patch_size=2, - tubelet_size=2, - num_frames=2, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - mask_ratio=0.9, - scope=None, - attn_implementation="eager", - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.num_channels = num_channels - self.patch_size = patch_size - self.tubelet_size = tubelet_size - self.num_frames = num_frames - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = 
num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.mask_ratio = mask_ratio - self.scope = scope - self.attn_implementation = attn_implementation - - # in VideoMAE, the number of tokens equals num_frames/tubelet_size * num_patches per frame - self.num_patches_per_frame = (image_size // patch_size) ** 2 - self.seq_length = (num_frames // tubelet_size) * self.num_patches_per_frame - - # use this variable to define bool_masked_pos - self.num_masks = int(mask_ratio * self.seq_length) - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size] - ) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return VideoMAEConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - num_frames=self.num_frames, - tubelet_size=self.tubelet_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - decoder_hidden_size=self.hidden_size, - decoder_intermediate_size=self.intermediate_size, - decoder_num_attention_heads=self.num_attention_heads, - decoder_num_hidden_layers=self.num_hidden_layers, - attn_implementation=self.attn_implementation, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = VideoMAEModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_pretraining(self, config, pixel_values, labels): - model = VideoMAEForPreTraining(config) - model.set_train(False) - # important: each video needs to have the same number of masked patches - # hence we define a single mask, which we then repeat for each example in the batch - mask = ops.ones((self.num_masks,)) - mask = ops.cat([mask, ops.zeros((self.seq_length - mask.shape[0]))]) - bool_masked_pos = mask.broadcast_to((self.batch_size, -1)).bool() - - result = model(pixel_values, bool_masked_pos) - # model only returns predictions for masked patches - num_masked_patches = mask.sum().item() - decoder_num_labels = 3 * self.tubelet_size * self.patch_size**2 - self.parent.assertEqual(result.logits.shape, (self.batch_size, num_masked_patches, decoder_num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - -@require_mindspore -class VideoMAEModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as VideoMAE does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = ( - (VideoMAEForPreTraining, VideoMAEForVideoClassification) if is_mindspore_available() else () - ) - pipeline_model_mapping = ( - {"feature-extraction": VideoMAEModel, "video-classification": VideoMAEForVideoClassification} - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = VideoMAEModelTester(self) - self.config_tester = ConfigTester(self, config_class=VideoMAEConfig, has_text_modality=False, hidden_size=37) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - - if model_class == VideoMAEForPreTraining: - # important: each video needs to have the same number of masked patches - # hence we define a single mask, which we then repeat for each example in the batch - mask = ops.ones((self.model_tester.num_masks,)) - mask = ops.cat([mask, ops.zeros((self.model_tester.seq_length - mask.shape[0]))]) - batch_size = inputs_dict["pixel_values"].shape[0] - bool_masked_pos = mask.broadcast_to((batch_size, -1)).bool() - inputs_dict["bool_masked_pos"] = bool_masked_pos - if return_labels: - if model_class in [ - *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING), - ]: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size), dtype=mindspore.int64 - ) - - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="VideoMAE does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "MCG-NJU/videomae-base" - model = VideoMAEModel.from_pretrained(model_name, from_pt=True) - self.assertIsNotNone(model) - - def test_attention_outputs(self): - if not self.has_attentions: - pass - - else: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - num_visible_patches = self.model_tester.seq_length - self.model_tester.num_masks - seq_len = ( - num_visible_patches if model_class == VideoMAEForPreTraining else self.model_tester.seq_length - ) - - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = 
model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - with mindspore._no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - expected_num_layers = self.model_tester.num_hidden_layers + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - num_visible_patches = self.model_tester.seq_length - self.model_tester.num_masks - seq_length = num_visible_patches if model_class == VideoMAEForPreTraining else self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - -# We will verify our results on a video of eating spaghetti -# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227] -def prepare_video(): - file = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset" - ) - video = np.load(file) - return list(video) - - -@require_mindspore -@require_vision -class VideoMAEModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - # logits were tested with a different mean and std, so we use the same here - return ( - VideoMAEImageProcessor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5]) - if is_vision_available() - else None - ) - - @slow - def test_inference_for_video_classification(self): - model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics", from_pt=True) - image_processor = self.default_image_processor - video = prepare_video() - inputs = image_processor(video, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 400) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([0.3669, -0.0688, -0.2421]) - - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - @slow - def test_inference_for_pretraining(self): - model = 
VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short", from_pt=True) - model.set_train(False) - image_processor = self.default_image_processor - video = prepare_video() - inputs = image_processor(video, return_tensors="ms") - local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt") - inputs["bool_masked_pos"] = load(local_path) - with mindspore._no_grad(): - outputs = model(**inputs) - expected_shape = (1, 1408, 1536) - expected_slice = mindspore.tensor( - [[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]]) - self.assertEqual(outputs.logits.shape, expected_shape) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1.5e-3)) - - # verify the loss (`config.norm_pix_loss` = `True`) - expected_loss = mindspore.tensor([0.5142]) - self.assertTrue(np.allclose(outputs.loss.asnumpy(), expected_loss.asnumpy(), atol=1e-4)) - # verify the loss (`config.norm_pix_loss` = `False`) - model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short", norm_pix_loss=False, from_pt=True) - - with mindspore._no_grad(): - outputs = model(**inputs) - - expected_loss = mindspore.tensor([0.6469]) - self.assertTrue(np.allclose(outputs.loss.asnumpy(), expected_loss.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/vilt/__init__.py b/tests/transformers/models/vilt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vilt/test_image_processing_vilt.py b/tests/transformers/models/vilt/test_image_processing_vilt.py deleted file mode 100644 index e81acd44d..000000000 --- a/tests/transformers/models/vilt/test_image_processing_vilt.py +++ /dev/null @@ -1,155 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the mindspore VilT model.""" -# pylint: disable=W0231 -# pylint: disable=E1102 - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ViltImageProcessor - - -class ViltImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - size_divisor=2, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"shortest_edge": 30} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.size_divisor = size_divisor - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - "size_divisor": self.size_divisor, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to ViltImageProcessor, - assuming do_resize is set to True with a scalar size and size_divisor. - """ - if not batched: - size = self.size["shortest_edge"] - image = image_inputs[0] - if isinstance(image, Image.Image): - w, h = image.size - else: - h, w = image.shape[1], image.shape[2] - scale = size / min(w, h) - if h < w: - newh, neww = size, scale * w - else: - newh, neww = scale * h, size - - max_size = int((1333 / 800) * size) - if max(newh, neww) > max_size: - scale = max_size / max(newh, neww) - newh = newh * scale - neww = neww * scale - - newh, neww = int(newh + 0.5), int(neww + 0.5) - expected_height, expected_width = ( - newh // self.size_divisor * self.size_divisor, - neww // self.size_divisor * self.size_divisor, - ) - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return (self.num_channels, height, width) - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class ViltImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = ViltImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = ViltImageProcessingTester(self) - - @property - def image_processor_dict(self): - return 
self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "size_divisor")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 30}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) diff --git a/tests/transformers/models/vilt/test_modeling_vilt.py b/tests/transformers/models/vilt/test_modeling_vilt.py deleted file mode 100644 index bc48170d1..000000000 --- a/tests/transformers/models/vilt/test_modeling_vilt.py +++ /dev/null @@ -1,654 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the mindspore ViLT model.""" -import io -# pylint: disable=W0231 -# pylint: disable=E1102 - -import unittest - -from packaging import version - -from mindnlp.transformers import ViltConfig, ViltProcessor -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - is_vision_available, - is_mindspore_available, - slow -) -from mindnlp.utils import cached_property -from mindnlp.dataset import load_dataset - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers.models.vilt import ( - ViltForImageAndTextRetrieval, - ViltForImagesAndTextClassification, - ViltForMaskedLM, - ViltForQuestionAnswering, - ViltForTokenClassification, - ViltModel, - ) - from mindnlp.transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES - -if is_vision_available(): - import numpy as np - import PIL - from PIL import Image - -class ViltModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - modality_type_vocab_size=2, - add_multiple_images=False, - num_images=-1, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.scope = scope - self.modality_type_vocab_size = modality_type_vocab_size - self.add_multiple_images = add_multiple_images - self.num_images = num_images - # we set the expected sequence length (which is used in several tests) - # this is equal to the seq length of the text tokens + number of image patches + 1 for the CLS token - self.expected_seq_len = self.seq_length + (self.image_size // self.patch_size) ** 2 + 1 - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - if self.add_multiple_images: - pixel_values = floats_tensor([self.batch_size, 2, self.num_channels, self.image_size, self.image_size]) - else: - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - 
token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - if self.use_labels: - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - - config = self.get_config() - - return (config, input_ids, token_type_ids, input_mask, pixel_values, token_labels) - - def get_config(self): - return ViltConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, - initializer_range=self.initializer_range, - num_labels=self.num_labels, - modality_type_vocab_size=self.modality_type_vocab_size, - num_images=self.num_images, - ) - - def create_and_check_model( - self, - config, - input_ids, - token_type_ids, - input_mask, - pixel_values, - token_labels, - ): - model = ViltModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, pixel_values=pixel_values) - result = model(input_ids, token_type_ids=token_type_ids, pixel_values=pixel_values) - result = model(input_ids, pixel_values=pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size) - ) - - def create_and_check_for_token_classification( - self, - config, - input_ids, - token_type_ids, - input_mask, - pixel_values, - token_labels, - ): - model = ViltForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, pixel_values=pixel_values) - result = model(input_ids, token_type_ids=token_type_ids, pixel_values=pixel_values) - result = model(input_ids, pixel_values=pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - pixel_values, - token_labels, - ) = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": input_mask, - "pixel_values": pixel_values, - } - return config, inputs_dict - - def prepare_pixel_values(self): - return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - -@require_mindspore -class ViltModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - ViltModel, - ViltForQuestionAnswering, - ViltForImageAndTextRetrieval, - ViltForMaskedLM, - ViltForTokenClassification, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": ViltModel, "visual-question-answering": ViltForQuestionAnswering} - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - test_mindsporescript = False - model_split_percents = [0.5, 0.8, 0.9] - - # ViltForMaskedLM, ViltForQuestionAnswering and ViltForImagesAndTextClassification require special treatment - def _prepare_for_class(self, inputs_dict, model_class, 
return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "ViltForQuestionAnswering": - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, self.model_tester.num_labels - ) - elif model_class.__name__ in ["ViltForMaskedLM", "ViltForTokenClassification"]: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - elif model_class.__name__ == "ViltForImagesAndTextClassification": - inputs_dict["labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - - return inputs_dict - - def setUp(self): - self.model_tester = ViltModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViltConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_training(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class.__name__ == "ViltForImagesAndTextClassification": - config.modality_type_vocab_size = 3 - - # ViltForImageAndTextRetrieval doesn't support training for now - if model_class.__name__ in [*MODEL_MAPPING_NAMES.values(), "ViltForImageAndTextRetrieval"]: - continue - - model = model_class(config) - model.set_train(True) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - for k, v in inputs.items(): - print(k, v.shape) - loss = model(**inputs).loss - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip( - reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states""" - ) - def test_save_load(self): - pass - - @unittest.skip( - reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states""" - ) - def test_determinism(self): - pass - - @unittest.skip( - "VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states" - ) - def test_batching_equivalence(self): - pass - - @unittest.skip( - reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states""" - ) - def test_model_outputs_equivalence(self): - pass - - @unittest.skip( - reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states. 
Cannot test equivalence on logit level""" - ) - def test_inputs_embeds_matches_input_ids(self): - pass - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "expected_seq_len", None) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - if model_class.__name__ == "ViltForImagesAndTextClassification": - # attentions are a list of length num_images - # each element contains the attentions of a particular image index - self.assertEqual(len(attentions), self.model_tester.num_images) - self.assertEqual(len(attentions[0]), self.model_tester.num_hidden_layers) - else: - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - if model_class.__name__ == "ViltForImagesAndTextClassification": - # attentions are a list of length num_images - # each element contains the attentions of a particular image index - self.assertEqual(len(attentions), self.model_tester.num_images) - self.assertEqual(len(attentions[0]), self.model_tester.num_hidden_layers) - else: - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - if model_class.__name__ == "ViltForImagesAndTextClassification": - self.assertListEqual( - list(attentions[0][0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - if model_class.__name__ == "ViltForImagesAndTextClassification": - self.assertEqual(len(self_attentions), self.model_tester.num_images) - self.assertEqual(len(self_attentions[0]), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0][0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - else: - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - 
if model_class.__name__ == "ViltForImagesAndTextClassification": - # hidden_states are a list of length num_images - # each element contains the hidden states of a particular image index - self.assertEqual(len(hidden_states), self.model_tester.num_images) - self.assertEqual(len(hidden_states[0]), expected_num_layers) - else: - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.expected_seq_len - - if model_class.__name__ == "ViltForImagesAndTextClassification": - self.assertListEqual( - list(hidden_states[0][0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - else: - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - print("Model class:", model_class) - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - @unittest.skip("MindSpore has no .grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - if model_class.__name__ == "ViltForImagesAndTextClassification": - # hidden_states are a list of length num_images - # each element contains the hidden states of a particular image index - hidden_states[0].retain_grad() - attentions[0].retain_grad() - else: - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - if model_class.__name__ == "ViltForImagesAndTextClassification": - # hidden_states are a list of length num_images - # each element contains the hidden states of a particular image index - self.assertIsNotNone(hidden_states[0].grad) - self.assertIsNotNone(attentions[0].grad) - else: - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - #@slow - def test_model_from_pretrained(self): - model_name = "dandelin/vilt-b32-mlm" - model = ViltModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class ViltForImagesAndTextClassificationModelTest(ViltModelTest, unittest.TestCase): - all_model_classes = (ViltForImagesAndTextClassification,) if is_mindspore_available() else () - - def setUp(self): - self.model_tester = ViltModelTester(self, modality_type_vocab_size=3, add_multiple_images=True, num_images=2) - self.config_tester = ConfigTester(self, config_class=ViltConfig, hidden_size=37) - - @unittest.skip("We only test the model that takes in multiple images") - def test_model(self): - pass - - @unittest.skip("We only test the model that takes in multiple images") - def test_for_token_classification(self): - pass - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = 
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - - return image - -def hex_to_image(hex_str): - image_bytes = bytes.fromhex(hex_str) - return Image.open(io.BytesIO(image_bytes)).convert("RGB") - -@require_mindspore -@require_vision -class ViltModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processor(self): - return ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") if is_vision_available() else None - - @slow - def test_inference_masked_lm(self): - model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm") - - processor = self.default_processor - image = prepare_img() - text = "a bunch of [MASK] laying on a [MASK]." - inputs = processor(image, text, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 11, 30522) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-12.5061, -12.5123, -12.5174]) - logits_slice_np = outputs.logits.asnumpy()[0, 0, :3] - expected_slice_np = expected_slice.asnumpy() - self.assertTrue(np.allclose(logits_slice_np, expected_slice_np, atol=1e-3)) - - # verify masked token prediction equals "cats" - predicted_id = outputs.logits[0, 4, :].argmax(-1).item() - assert processor.decode([predicted_id]) == "cats" - - @slow - def test_inference_visual_question_answering(self): - model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") - - processor = self.default_processor - image = prepare_img() - text = "How many cats are there?" - inputs = processor(image, text, return_tensors="ms") - - # forward pass - model.set_train(False) - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 3129) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-15.9495, -18.1472, -10.3041]) - logits_np = outputs.logits[0, :3].asnumpy() - expected_slice_np = expected_slice.asnumpy() - - self.assertTrue(np.allclose(logits_np, expected_slice_np, atol=1e-4)) - - # compute loss - vqa_labels = [[2, 3, 155, 800]] - vqa_scores = [[1.0, 0.3, 0.3, 0.3]] - labels = ops.zeros(1, model.config.num_labels) - - for i, (labels_example, scores_example) in enumerate(zip(vqa_labels, vqa_scores)): - for l, s in zip(labels_example, scores_example): - labels[i, l] = s - - # forward pass - outputs = model(**inputs, labels=labels) - - # verify we have a positive loss - self.assertTrue(outputs.loss > 0) - - @slow - def test_inference_natural_language_visual_reasoning(self): - model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2") - - processor = self.default_processor - - dataset = load_dataset("hf-internal-testing/fixtures_nlvr2", split="test", trust_remote_code=True) - image1 = Image.open(dataset.source.ds[0]["file"]).convert("RGB") - image2 = Image.open(dataset.source.ds[1]["file"]).convert("RGB") - - text = ( - "The left image contains twice the number of dogs as the right image, and at least two dogs in total are" - " standing." 
- ) - encoding_1 = processor(image1, text, return_tensors="ms") - encoding_2 = processor(image2, text, return_tensors="ms") - - pixel_values = ops.stack([encoding_1.pixel_values, encoding_2.pixel_values], axis=1) - - # forward pass - outputs = model( - input_ids=encoding_1.input_ids, - pixel_values=pixel_values, - ) - - # verify the logits - expected_shape = (1, 2) - self.assertEqual(outputs.logits.shape, expected_shape) - - is_pillow_less_than_9 = version.parse(PIL.__version__) < version.parse("9.0.0") - - if is_pillow_less_than_9: - expected_slice = mindspore.tensor( - [-2.4013, 2.9342], - ) - else: - expected_slice = mindspore.tensor( - [-2.3713, 2.9168], - ) - logits_np = outputs.logits[0, :3].asnumpy() - expected_slice_np =expected_slice.asnumpy() - - self.assertTrue(np.allclose(logits_np, expected_slice_np, atol=1e-3)) diff --git a/tests/transformers/models/vipllava/__init__.py b/tests/transformers/models/vipllava/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vipllava/test_modeling_vipllava.py b/tests/transformers/models/vipllava/test_modeling_vipllava.py deleted file mode 100644 index cd0d2dbc0..000000000 --- a/tests/transformers/models/vipllava/test_modeling_vipllava.py +++ /dev/null @@ -1,456 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================ -""" Testing suite for the MindSpore VipLlava model. 
""" - -import copy -import gc -import unittest - -import requests - -from mindnlp.transformers import ( - AutoProcessor, - VipLlavaConfig, - VipLlavaForConditionalGeneration, -) -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import ops, no_grad - -if is_vision_available(): - from PIL import Image - - -# Copied from transformers.tests.models.llava.test_modeling_llava.LlavaVisionText2TextModelTester with Llava->VipLlava -class VipLlavaVisionText2TextModelTester: - # Ignore copy - def __init__( - self, - parent, - ignore_index=-100, - image_token_index=0, - projector_hidden_act="gelu", - seq_length=7, - vision_feature_layers=[0, 0, 1, 1, 0], - text_config={ - "model_type": "llama", - "seq_length": 7, - "is_training": True, - "use_input_mask": True, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 0, - }, - is_training=True, - vision_config={ - "batch_size": 12, - "image_size": 30, - "patch_size": 2, - "num_channels": 3, - "is_training": True, - "hidden_size": 32, - "projection_dim": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "dropout": 0.1, - "attention_dropout": 0.1, - "initializer_range": 0.02, - }, - ): - self.parent = parent - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.vision_feature_layers = vision_feature_layers - self.text_config = text_config - self.vision_config = vision_config - self.seq_length = seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] - self.is_training = is_training - - self.batch_size = 3 - self.num_channels = 3 - self.image_size = 336 - self.encoder_seq_length = 231 - - def get_config(self): - return VipLlavaConfig( - text_config=self.text_config, - vision_config=self.vision_config, - ignore_index=self.ignore_index, - image_token_index=self.image_token_index, - projector_hidden_act=self.projector_hidden_act, - vision_feature_layers=self.vision_feature_layers, - ) - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [ - self.batch_size, - self.vision_config["num_channels"], - self.vision_config["image_size"], - self.vision_config["image_size"], - ] - ) - config = self.get_config() - - return config, pixel_values - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - input_ids = ids_tensor( - [self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1) - # we are giving 3 images let's make sure we pass in 3 image tokens - input_ids[:, 1] = config.image_token_index - inputs_dict = { - "pixel_values": 
pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - -@require_mindspore -# Copied from transformers.tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest with Llava->VipLlava -class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase): - """ - Model tester for `VipLlavaForConditionalGeneration`. - """ - - all_model_classes = (VipLlavaForConditionalGeneration, - ) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = True - test_head_masking = False - - def setUp(self): - self.model_tester = VipLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester( - self, config_class=VipLlavaConfig, has_text_modality=False) - - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(input_ids) - - with no_grad(): - model(**inputs) - - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - # Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_tokens_embeddings with config.vocab_size->config.text_config.vocab_size - def test_resize_tokens_embeddings(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.text_config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual( - model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - 
self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual( - model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - - # make sure that decoder_input_ids are resized as well - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp( - max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. - models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - config = copy.deepcopy(original_config) - model = model_class(config) - - model_vocab_size = config.text_config.vocab_size - model.resize_token_embeddings( - model_vocab_size + 10, pad_to_multiple_of=1) - self.assertTrue( - model.config.text_config.vocab_size + 10, model_vocab_size) - - model_embed = model.resize_token_embeddings( - model_vocab_size, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0] // 64, 0) - - self.assertTrue( - model_embed.weight.shape[0], model.config.text_config.vocab_size) - self.assertTrue( - model.config.text_config.vocab_size, model.vocab_size) - - model_embed = model.resize_token_embeddings( - model_vocab_size + 13, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0] // 64, 0) - - # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size - target_dimension = 128 - model_embed = model.resize_token_embeddings( - target_dimension, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0], target_dimension) - - with self.assertRaisesRegex( - ValueError, - "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", - ): - model.resize_token_embeddings( - model_vocab_size, pad_to_multiple_of=1.3) - - # Copied from tests.test_modeling_common.ModelTesterMixin.test_resize_embeddings_untied with config.vocab_size->config.text_config.vocab_size - def test_resize_embeddings_untied(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - return - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - return - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.text_config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual( - output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual( - output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual( - model.config.text_config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual( - output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual( - output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp( - max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Copied from tests.test_modeling_common.ModelTesterMixin.test_tie_model_weights with config.vocab_size->config.text_config.vocab_size - def test_tie_model_weights(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.data.ne(p2.data).sum() > 0: - equal = False - return equal - - for model_class in self.all_model_classes: - config.torchscript = True - model_not_tied = model_class(config) - if model_not_tied.get_output_embeddings() is None: - continue - - config_tied = copy.deepcopy(config) - config_tied.torchscript = False - model_tied = model_class(config_tied) - params_tied = list(model_tied.get_parameters()) - # Check that the embedding layer and decoding layer are the same in size and in value - # 
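The pad_to_multiple_of assertions above amount to ceiling arithmetic: the embedding count becomes the requested size rounded up to the next multiple, and non-integer multiples raise the ValueError matched above. A worked sketch of that rounding (plain Python, numbers chosen for illustration only):

    def padded_vocab(requested: int, multiple: int) -> int:
        # Round `requested` up to the nearest multiple of `multiple`.
        return ((requested + multiple - 1) // multiple) * multiple

    assert padded_vocab(99, 64) == 128        # e.g. a vocab of 99 padded up to 128
    assert padded_vocab(99 + 13, 64) == 128   # +13 tokens still lands on the same multiple
    assert padded_vocab(128, 64) == 128       # already a multiple: unchanged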
self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. - model_tied.resize_token_embeddings( - config.text_config.vocab_size + 10) - params_tied_2 = list(model_tied.get_parameters()) - self.assertEqual(len(params_tied_2), len(params_tied)) - - -@require_mindspore -class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase): - def setUp(self): - self.processor = AutoProcessor.from_pretrained( - "llava-hf/vip-llava-7b-hf") - - def tearDown(self): - gc.collect() - - @slow - def test_small_model_integration_test(self): - model_id = "llava-hf/vip-llava-7b-hf" - - model = VipLlavaForConditionalGeneration.from_pretrained( - model_id, load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png" - - image = Image.open(requests.get(url, stream=True).raw) - prompt = "USER: \nCan you please describe this image?\nASSISTANT:" - - inputs = processor( - prompt, image, return_tensors="ms").astype(ms.float16) - - outputs = model.generate(**inputs, max_new_tokens=10) - - EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on" - self.assertEqual(processor.decode( - outputs[0], skip_special_tokens=True), EXPECTED_OUTPUT) - - @slow - def test_vipllava_merge_inputs_error_bug(self): - # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore - model_id = "llava-hf/vip-llava-7b-hf" - model = VipLlavaForConditionalGeneration.from_pretrained( - model_id - ) - - # Simulate some user inputs - pixel_values = ops.randn( - (2, 3, 336, 336), - dtype=ms.float32, - ) - input_ids = ms.Tensor( - [ - [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900], - [1, 15043, 7084, 29901, 29871, 32000, 29871, 13, 7900], - ], - dtype=ms.int64, - ) - attention_mask = ms.Tensor( - [[0, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], - dtype=ms.int64, - ) - - # Make sure that the loss is properly computed - loss = model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - labels=input_ids, - ).loss - loss.backward() diff --git a/tests/transformers/models/vision_encoder_decoder/__init__.py b/tests/transformers/models/vision_encoder_decoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/transformers/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py deleted file mode 100644 index 65cca859b..000000000 --- a/tests/transformers/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ /dev/null @@ -1,1178 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
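The configuration checks in the encoder-decoder tests below (is_decoder, add_cross_attention, is_encoder_decoder) follow from how the composite config is assembled. A hedged sketch of that wiring, using the same from_encoder_decoder_configs helper the tests call; `encoder_config` and `decoder_config` are placeholders for any compatible configs, not the deleted fixtures:

    # Sketch: pair a vision encoder config with a text decoder config.
    composite = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
    assert composite.decoder.is_decoder            # the helper forces the decoder flag on
    assert composite.decoder.add_cross_attention   # and enables cross-attention layers
    model = VisionEncoderDecoderModel(composite)   # the composite config is enough to build the model
    assert model.config.is_encoder_decoder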
- -import re -import tempfile -import unittest - -from huggingface_hub import hf_hub_download -from packaging import version - -from mindnlp.transformers import DonutProcessor, NougatProcessor, TrOCRProcessor -from mindnlp.utils.testing_utils import ( - require_levenshtein, - require_nltk, - require_sentencepiece, - require_mindspore, - require_vision, - slow, - to_2tuple, -) -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available -from datasets import load_dataset - -from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from ..bart.test_modeling_bart import BartModelTester -from ..bert.test_modeling_bert import BertModelTester -from ..deit.test_modeling_deit import DeiTModelTester -from ..layoutlmv3.test_modeling_layoutlmv3 import LayoutLMv3ModelTester -from ..swin.test_modeling_swin import SwinModelTester -from ..trocr.test_modeling_trocr import TrOCRStandaloneDecoderModelTester -from ..vit.test_modeling_vit import ViTModelTester - - -if is_mindspore_available(): - import numpy as np - import mindspore - from mindnlp.core import ops, nn, no_grad - - from mindnlp.transformers import ( - AutoTokenizer, - BartForCausalLM, - BertLMHeadModel, - DeiTModel, - LayoutLMv3Model, - SwinModel, - TrOCRForCausalLM, - VisionEncoderDecoderConfig, - VisionEncoderDecoderModel, - ViTModel, - ) - from mindnlp.transformers.modeling_outputs import BaseModelOutput - - -if is_vision_available(): - import PIL - from PIL import Image - - from mindnlp.transformers import ViTImageProcessor - - -@require_mindspore -class EncoderDecoderMixin: - def get_encoder_decoder_model(self, config, decoder_config): - pass - - def prepare_config_and_inputs(self): - pass - - def get_pretrained_model_and_inputs(self): - pass - - def check_encoder_decoder_model_from_pretrained_configs( - self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs - ): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) - self.assertTrue(encoder_decoder_config.decoder.is_decoder) - - enc_dec_model = VisionEncoderDecoderModel(encoder_decoder_config) - enc_dec_model.eval() - - self.assertTrue(enc_dec_model.config.is_encoder_decoder) - - outputs_encoder_decoder = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_encoder_decoder_model( - self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - self.assertTrue(enc_dec_model.config.decoder.is_decoder) - self.assertTrue(enc_dec_model.config.decoder.add_cross_attention) - self.assertTrue(enc_dec_model.config.is_encoder_decoder) - outputs_encoder_decoder = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - output_hidden_states=True, - ) - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - encoder_outputs = BaseModelOutput(last_hidden_state=outputs_encoder_decoder.encoder_hidden_states[-1]) - outputs_encoder_decoder = enc_dec_model( - 
encoder_outputs=encoder_outputs, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_encoder_decoder_model_from_pretrained( - self, - config, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - return_dict, - pixel_values=None, - **kwargs, - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} - enc_dec_model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(**kwargs) - outputs_encoder_decoder = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - output_hidden_states=True, - return_dict=True, - ) - - self.assertEqual( - outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) - ) - - def check_save_and_load( - self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - enc_dec_model.eval() - with no_grad(): - outputs = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - out_2 = outputs[0].asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - enc_dec_model.save_pretrained(tmpdirname) - enc_dec_model = VisionEncoderDecoderModel.from_pretrained(tmpdirname) - - after_outputs = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - out_1 = after_outputs[0].asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def check_save_and_load_encoder_decoder_model( - self, config, decoder_config, decoder_input_ids, decoder_attention_mask, pixel_values=None, **kwargs - ): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - enc_dec_model.eval() - with no_grad(): - outputs = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - out_2 = outputs[0].asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: - enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) - enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) - VisionEncoderDecoderModel.from_encoder_decoder_pretrained( - encoder_pretrained_model_name_or_path=encoder_tmp_dirname, - decoder_pretrained_model_name_or_path=decoder_tmp_dirname, - ) - - after_outputs = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - ) - out_1 = after_outputs[0].asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def check_encoder_decoder_model_output_attentions( - self, - config, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - 
labels=None, - pixel_values=None, - **kwargs, - ): - # make the decoder inputs a different shape from the encoder inputs to harden the test - decoder_input_ids = decoder_input_ids[:, :-1] - decoder_attention_mask = decoder_attention_mask[:, :-1] - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - outputs_encoder_decoder = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - ) - - encoder_attentions = outputs_encoder_decoder["encoder_attentions"] - self.assertEqual(len(encoder_attentions), config.num_hidden_layers) - - # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) - image_size = to_2tuple(encoder_model.config.image_size) - patch_size = to_2tuple(encoder_model.config.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_len = num_patches + 1 - self.assertEqual(encoder_attentions[0].shape[-3:], (config.num_attention_heads, seq_len, seq_len)) - - decoder_attentions = outputs_encoder_decoder["decoder_attentions"] - num_decoder_layers = ( - decoder_config.num_decoder_layers - if hasattr(decoder_config, "num_decoder_layers") - else decoder_config.num_hidden_layers - ) - self.assertEqual(len(decoder_attentions), num_decoder_layers) - - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), - ) - - cross_attentions = outputs_encoder_decoder["cross_attentions"] - self.assertEqual(len(cross_attentions), num_decoder_layers) - - cross_attention_input_seq_len = decoder_input_ids.shape[-1] - self.assertEqual( - cross_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), - ) - - def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_values=None, **kwargs): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - - # Generate until max length - if hasattr(enc_dec_model.config, "eos_token_id"): - enc_dec_model.config.eos_token_id = None - if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): - enc_dec_model.config.decoder.eos_token_id = None - if hasattr(enc_dec_model.generation_config, "eos_token_id"): - enc_dec_model.generation_config.eos_token_id = None - - inputs = pixel_values - - # Bert does not have a bos token id, so use pad_token_id instead - generated_output = enc_dec_model.generate( - inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id - ) - self.assertEqual(generated_output.shape, (inputs.shape[0],) + (decoder_config.max_length,)) - - def test_encoder_decoder_model(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model(**input_ids_dict) - - def test_encoder_decoder_model_from_pretrained_configs(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_from_pretrained_configs(**input_ids_dict) - - def test_encoder_decoder_model_from_pretrained(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=False) - - def 
test_encoder_decoder_model_from_pretrained_return_dict(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_from_pretrained(**input_ids_dict, return_dict=True) - - def test_save_and_load_from_pretrained(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_save_and_load(**input_ids_dict) - - def test_save_and_load_from_encoder_decoder_pretrained(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_save_and_load_encoder_decoder_model(**input_ids_dict) - - def test_encoder_decoder_model_output_attentions(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_output_attentions(**input_ids_dict) - - def test_encoder_decoder_model_generate(self): - input_ids_dict = self.prepare_config_and_inputs() - self.check_encoder_decoder_model_generate(**input_ids_dict) - - @unittest.skip - def test_training_gradient_checkpointing(self): - inputs_dict = self.prepare_config_and_inputs() - encoder_model, decoder_model = self.get_encoder_decoder_model( - inputs_dict["config"], inputs_dict["decoder_config"] - ) - - model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - model.train() - model.gradient_checkpointing_enable() - model.config.decoder_start_token_id = 0 - model.config.pad_token_id = 0 - - model_inputs = { - "pixel_values": inputs_dict["pixel_values"], - "labels": inputs_dict["labels"], - "decoder_input_ids": inputs_dict["decoder_input_ids"], - } - - loss = model(**model_inputs).loss - loss.backward() - - @slow - def test_real_model_save_load_from_pretrained(self): - model_2, inputs = self.get_pretrained_model_and_inputs() - - with no_grad(): - outputs = model_2(**inputs) - out_2 = outputs[0].asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmp_dirname: - model_2.save_pretrained(tmp_dirname) - model_1 = VisionEncoderDecoderModel.from_pretrained(tmp_dirname) - - after_outputs = model_1(**inputs) - out_1 = after_outputs[0].asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - -@require_mindspore -class DeiT2RobertaModelTest(EncoderDecoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( - "hf-internal-testing/tiny-random-deit", "hf-internal-testing/tiny-random-roberta" - ) - batch_size = 13 - pixel_values = floats_tensor( - [ - batch_size, - model.encoder.config.num_channels, - model.encoder.config.image_size, - model.encoder.config.image_size, - ] - ) - # for DEiT, the sequence length is equal to the number of patches + 2 (for the [CLS] and distillation tokens) - decoder_input_ids = ids_tensor([batch_size, 4], model.decoder.config.vocab_size) - decoder_attention_mask = random_attention_mask([batch_size, 4]) - inputs = { - "pixel_values": pixel_values, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - } - - return model, inputs - - def check_encoder_decoder_model_output_attentions( - self, - config, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - labels=None, - pixel_values=None, - **kwargs, - ): - # make the decoder inputs a different shape from the encoder inputs to harden the test - decoder_input_ids = decoder_input_ids[:, :-1] - decoder_attention_mask = decoder_attention_mask[:, :-1] - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = 
VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - outputs_encoder_decoder = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - ) - - encoder_attentions = outputs_encoder_decoder["encoder_attentions"] - self.assertEqual(len(encoder_attentions), config.num_hidden_layers) - - # in DEiT, the seq_len equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) - image_size = to_2tuple(encoder_model.config.image_size) - patch_size = to_2tuple(encoder_model.config.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_len = num_patches + 2 - self.assertEqual(encoder_attentions[0].shape[-3:], (config.num_attention_heads, seq_len, seq_len)) - - decoder_attentions = outputs_encoder_decoder["decoder_attentions"] - num_decoder_layers = ( - decoder_config.num_decoder_layers - if hasattr(decoder_config, "num_decoder_layers") - else decoder_config.num_hidden_layers - ) - self.assertEqual(len(decoder_attentions), num_decoder_layers) - - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), - ) - - cross_attentions = outputs_encoder_decoder["cross_attentions"] - self.assertEqual(len(cross_attentions), num_decoder_layers) - - cross_attention_input_seq_len = decoder_input_ids.shape[-1] - self.assertEqual( - cross_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), - ) - - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = DeiTModel(config).eval() - decoder_model = BertLMHeadModel(decoder_config).eval() - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - bert_model_tester = BertModelTester(self) - deit_model_tester = DeiTModelTester(self) - encoder_config_and_inputs = deit_model_tester.prepare_config_and_inputs() - decoder_config_and_inputs = bert_model_tester.prepare_config_and_inputs_for_decoder() - config, pixel_values, _ = encoder_config_and_inputs - ( - decoder_config, - decoder_input_ids, - decoder_token_type_ids, - decoder_input_mask, - decoder_sequence_labels, - decoder_token_labels, - decoder_choice_labels, - encoder_attention_mask, - _, - ) = decoder_config_and_inputs - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - return { - "config": config, - "pixel_values": pixel_values, - "decoder_config": decoder_config, - "decoder_input_ids": decoder_input_ids, - "decoder_token_type_ids": decoder_token_type_ids, - "decoder_attention_mask": decoder_input_mask, - "decoder_sequence_labels": decoder_sequence_labels, - "decoder_token_labels": decoder_token_labels, - "decoder_choice_labels": decoder_choice_labels, - "labels": decoder_token_labels, - } - - -@require_mindspore -class ViT2BertModelTest(EncoderDecoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( - "hf-internal-testing/tiny-random-vit", "hf-internal-testing/tiny-bert" - ) - batch_size = 13 - pixel_values = floats_tensor( - [ - batch_size, - model.encoder.config.num_channels, - model.encoder.config.image_size, - model.encoder.config.image_size, - ] - ) - # for ViT, the sequence length is equal to the number of patches + 1 (for the [CLS] token) - decoder_input_ids = ids_tensor([batch_size, 4], 
model.decoder.config.vocab_size) - decoder_attention_mask = random_attention_mask([batch_size, 4]) - inputs = { - "pixel_values": pixel_values, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - } - - return model, inputs - - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = ViTModel(config).eval() - decoder_model = BertLMHeadModel(decoder_config).eval() - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - vit_model_tester = ViTModelTester(self) - bert_model_tester = BertModelTester(self) - encoder_config_and_inputs = vit_model_tester.prepare_config_and_inputs() - decoder_config_and_inputs = bert_model_tester.prepare_config_and_inputs_for_decoder() - - config, pixel_values, _ = encoder_config_and_inputs - - ( - decoder_config, - decoder_input_ids, - decoder_token_type_ids, - decoder_input_mask, - decoder_sequence_labels, - decoder_token_labels, - decoder_choice_labels, - encoder_attention_mask, - _, - ) = decoder_config_and_inputs - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - return { - "config": config, - "pixel_values": pixel_values, - "decoder_config": decoder_config, - "decoder_input_ids": decoder_input_ids, - "decoder_token_type_ids": decoder_token_type_ids, - "decoder_attention_mask": decoder_input_mask, - "decoder_sequence_labels": decoder_sequence_labels, - "decoder_token_labels": decoder_token_labels, - "decoder_choice_labels": decoder_choice_labels, - "labels": decoder_token_labels, - } - - -@require_mindspore -class Swin2BartModelTest(EncoderDecoderMixin, unittest.TestCase): - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = SwinModel(config).eval() - decoder_model = BartForCausalLM(decoder_config).eval() - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - model_tester_encoder = SwinModelTester(self, batch_size=13, embed_dim=32) - model_tester_decoder = BartModelTester(self, batch_size=13, hidden_size=32, max_position_embeddings=512) - encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() - decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs() - config, pixel_values, _ = encoder_config_and_inputs - decoder_config, decoder_inputs_dict = decoder_config_and_inputs - decoder_inputs_dict["labels"] = decoder_inputs_dict["decoder_input_ids"] - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - # disable cache for now - decoder_config.use_cache = False - return { - "config": config, - "pixel_values": pixel_values, - "decoder_config": decoder_config, - **decoder_inputs_dict, - } - - def check_encoder_decoder_model_output_attentions( - self, - config, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - labels=None, - pixel_values=None, - **kwargs, - ): - # make the decoder inputs a different shape from the encoder inputs to harden the test - decoder_input_ids = decoder_input_ids[:, :-1] - decoder_attention_mask = decoder_attention_mask[:, :-1] - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - outputs_encoder_decoder = enc_dec_model( - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - ) - - encoder_attentions = 
outputs_encoder_decoder["encoder_attentions"] - self.assertEqual(len(encoder_attentions), config.num_hidden_layers) - - # in Swin, the seq_len equals: - seq_len = encoder_model.config.window_size**2 - self.assertEqual(encoder_attentions[0].shape[-3:], (config.num_attention_heads[0], seq_len, seq_len)) - - decoder_attentions = outputs_encoder_decoder["decoder_attentions"] - num_decoder_layers = ( - decoder_config.num_decoder_layers - if hasattr(decoder_config, "num_decoder_layers") - else decoder_config.num_hidden_layers - ) - self.assertEqual(len(decoder_attentions), num_decoder_layers) - - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), - ) - - cross_attentions = outputs_encoder_decoder["cross_attentions"] - self.assertEqual(len(cross_attentions), num_decoder_layers) - - encoder_seq_len = ((config.image_size // config.patch_size) ** 2) // (4 ** (len(config.depths) - 1)) - cross_attention_input_seq_len = decoder_input_ids.shape[-1] - self.assertEqual( - cross_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, cross_attention_input_seq_len, encoder_seq_len), - ) - - @unittest.skip(reason="There are no published pretrained BART-causal checkpoints for now") - def test_real_model_save_load_from_pretrained(self): - pass - - -@require_mindspore -class ViT2TrOCR(EncoderDecoderMixin, unittest.TestCase): - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = ViTModel(config).eval() - decoder_model = TrOCRForCausalLM(decoder_config).eval() - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - model_tester_encoder = ViTModelTester(self, batch_size=13) - model_tester_decoder = TrOCRStandaloneDecoderModelTester( - self, batch_size=13, d_model=32, max_position_embeddings=512 - ) - encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() - decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs() - config, pixel_values, _ = encoder_config_and_inputs - (decoder_config, decoder_input_ids, decoder_attention_mask, _) = decoder_config_and_inputs - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - # disable cache for now - decoder_config.use_cache = False - return { - "config": config, - "pixel_values": pixel_values, - "decoder_config": decoder_config, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "labels": decoder_input_ids, - } - - @unittest.skip(reason="There are no published pretrained TrOCR checkpoints for now") - def test_real_model_save_load_from_pretrained(self): - pass - - -@require_mindspore -class LayoutLMv32TrOCR(EncoderDecoderMixin, unittest.TestCase): - def get_encoder_decoder_model(self, config, decoder_config): - encoder_model = LayoutLMv3Model(config).eval() - decoder_model = TrOCRForCausalLM(decoder_config).eval() - return encoder_model, decoder_model - - def prepare_config_and_inputs(self): - model_tester_encoder = LayoutLMv3ModelTester(self, batch_size=13, image_size=4, patch_size=2) - model_tester_decoder = TrOCRStandaloneDecoderModelTester( - self, batch_size=13, d_model=32, max_position_embeddings=512 - ) - encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs() - decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs() - ( - config, - input_ids, - bbox, - pixel_values, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - ) = 
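The two Swin sequence lengths used above are worth unpacking: self-attention is windowed, so each attention map covers window_size**2 tokens, while the encoder output the decoder cross-attends to has been merged spatially after every stage except the last, dividing the token count by 4 per merge. A worked instance of that arithmetic (illustrative numbers, not the SwinModelTester defaults):

    def swin_window_tokens(window_size: int) -> int:
        # Tokens inside one local attention window.
        return window_size ** 2

    def swin_final_seq_len(image_size: int, patch_size: int, num_stages: int) -> int:
        # Patch tokens after patch embedding, divided by 4 at each of the
        # (num_stages - 1) patch-merging steps.
        return ((image_size // patch_size) ** 2) // (4 ** (num_stages - 1))

    assert swin_window_tokens(7) == 49
    assert swin_final_seq_len(224, 4, 4) == 49   # 56x56 patches merged down to 7x7 tokens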
encoder_config_and_inputs - (decoder_config, decoder_input_ids, decoder_attention_mask, _) = decoder_config_and_inputs - - # make sure that cross attention layers are added - decoder_config.add_cross_attention = True - # disable cache for now - decoder_config.use_cache = False - return { - "config": config, - "pixel_values": pixel_values, - "input_ids": input_ids, - "bbox": bbox, - "decoder_config": decoder_config, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "labels": decoder_input_ids, - } - - def check_encoder_decoder_model_output_attentions( - self, - config, - decoder_config, - decoder_input_ids, - decoder_attention_mask, - input_ids, - pixel_values, - labels=None, - **kwargs, - ): - # make the decoder inputs a different shape from the encoder inputs to harden the test - decoder_input_ids = decoder_input_ids[:, :-1] - decoder_attention_mask = decoder_attention_mask[:, :-1] - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - outputs_encoder_decoder = enc_dec_model( - input_ids=input_ids, - pixel_values=pixel_values, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - output_attentions=True, - **kwargs, - ) - - encoder_attentions = outputs_encoder_decoder["encoder_attentions"] - self.assertEqual(len(encoder_attentions), config.num_hidden_layers) - - # LayoutLMv3's sequence length equals the number of text tokens + number of patches + 1 (we add 1 for the CLS token) - text_seq_length = input_ids.shape[-1] - image_seq_length = (encoder_model.config.input_size // encoder_model.config.patch_size) ** 2 + 1 - seq_len = text_seq_length + image_seq_length - - decoder_attentions = outputs_encoder_decoder["decoder_attentions"] - num_decoder_layers = ( - decoder_config.num_decoder_layers - if hasattr(decoder_config, "num_decoder_layers") - else decoder_config.num_hidden_layers - ) - self.assertEqual(len(decoder_attentions), num_decoder_layers) - - self.assertEqual( - decoder_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]), - ) - - cross_attentions = outputs_encoder_decoder["cross_attentions"] - self.assertEqual(len(cross_attentions), num_decoder_layers) - - cross_attention_input_seq_len = decoder_input_ids.shape[-1] - self.assertEqual( - cross_attentions[0].shape[-3:], - (decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), - ) - - def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_values=None, **kwargs): - encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) - - # Generate until max length - if hasattr(enc_dec_model.config, "eos_token_id"): - enc_dec_model.config.eos_token_id = None - if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"): - enc_dec_model.config.decoder.eos_token_id = None - if hasattr(enc_dec_model.generation_config, "eos_token_id"): - enc_dec_model.generation_config.eos_token_id = None - - generated_output = enc_dec_model.generate( - pixel_values=pixel_values, - decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, - **kwargs, - ) - self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) - - @unittest.skip(reason="There are 
no published pretrained TrOCR checkpoints for now") - def test_real_model_save_load_from_pretrained(self): - pass - - -@require_vision -@require_mindspore -class TrOCRModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processor(self): - return TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") if is_vision_available() else None - - @slow - def test_inference_handwritten(self): - model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") - - dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True) - image = Image.open(dataset[0]["file"]).convert("RGB") - - processor = self.default_processor - pixel_values = processor(images=image, return_tensors="ms").pixel_values - - # forward pass - decoder_input_ids = mindspore.tensor([[model.config.decoder.decoder_start_token_id]]) - outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1, model.decoder.config.vocab_size) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [-1.4502, -4.6683, -0.5347, -2.9291, 9.1435, -3.0571, 8.9764, 1.7560, 8.7358, -1.5311] - ) - - self.assertTrue(ops.allclose(logits[0, 0, :10], expected_slice, atol=1e-4)) - - @slow - def test_inference_printed(self): - model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed") - - dataset = load_dataset("hf-internal-testing/fixtures_ocr", split="test", trust_remote_code=True) - image = Image.open(dataset[1]["file"]).convert("RGB") - - processor = self.default_processor - pixel_values = processor(images=image, return_tensors="ms").pixel_values - - # forward pass - decoder_input_ids = mindspore.tensor([[model.config.decoder.decoder_start_token_id]]) - outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1, model.decoder.config.vocab_size) - self.assertEqual(outputs.logits.shape, expected_shape) - - is_pillow_less_than_9 = version.parse(PIL.__version__) < version.parse("9.0.0") - - if is_pillow_less_than_9: - expected_slice = mindspore.tensor( - [-5.6816, -5.8388, 1.1398, -6.9034, 6.8505, -2.4393, 1.2284, -1.0232, -1.9661, -3.9210], - ) - else: - expected_slice = mindspore.tensor( - [-5.6844, -5.8372, 1.1518, -6.8984, 6.8587, -2.4453, 1.2347, -1.0241, -1.9649, -3.9109], - ) - - self.assertTrue(ops.allclose(logits[0, 0, :10], expected_slice, atol=1e-4)) - - -@require_vision -@require_mindspore -class ViT2GPT2ModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" - - image_processor = ViTImageProcessor.from_pretrained(loc) - tokenizer = AutoTokenizer.from_pretrained(loc) - model = VisionEncoderDecoderModel.from_pretrained(loc) - model.eval() - - # We will verify our results on an image of cute cats - img = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - pixel_values = image_processor(images=img, return_tensors="ms").pixel_values - - decoder_input_ids = mindspore.tensor([[model.config.decoder_start_token_id]]) - - with no_grad(): - logits = model(pixel_values, decoder_input_ids)[0].asnumpy() - - # verify the logits - expected_shape = (1, 1, model.config.decoder.vocab_size) - self.assertEqual(logits.shape, expected_shape) - - EXPECTED_LOGIT_SLICE = np.array( - [ - -38.705807, - -30.639929, - -31.41903, - -39.012012, - -38.38696, - 
-34.887207, - -33.290855, - -35.68447, - -38.508484, - -36.124645, - ] - ) - max_diff = np.amax(np.abs(logits[0, 0, :10] - EXPECTED_LOGIT_SLICE)) - self.assertLessEqual(max_diff, 1e-4) - - def generate_step(pixel_values): - outputs = model.generate( - pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True - ) - output_ids = outputs.sequences - preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - preds = [pred.strip() for pred in preds] - - return preds, outputs.sequences_scores.asnumpy() - - preds, scores = generate_step(pixel_values) - - EXPECTED_SCORES = np.array([-0.5956343]) - max_diff = np.amax(np.abs(scores - EXPECTED_SCORES)) - self.assertLessEqual(max_diff, 1e-4) - - # should produce - # ["a cat laying on top of a couch next to another cat"] - self.assertEqual(preds, ["a cat laying on top of a couch next to another cat"]) - - -@require_vision -@require_mindspore -@require_sentencepiece -class DonutModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_docvqa(self): - processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") - model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") - dataset = load_dataset("hf-internal-testing/example-documents", split="test") - image = dataset[0]["image"] - - pixel_values = processor(images=image, return_tensors="ms").pixel_values - decoder_input_ids = processor.tokenizer( - "", add_special_tokens=False, return_tensors="ms" - ).input_ids - - # step 1: single forward pass - with no_grad(): - outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1, 57532) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([24.3873, -6.4491, 32.5394]) - self.assertTrue(ops.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) - - # step 2: generation - task_prompt = "{user_input}" - question = "When is the coffee break?" - prompt = task_prompt.replace("{user_input}", question) - decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="ms").input_ids - decoder_input_ids = decoder_input_ids - - outputs = model.generate( - pixel_values, - decoder_input_ids=decoder_input_ids, - max_length=model.decoder.config.max_position_embeddings, - early_stopping=True, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - use_cache=True, - num_beams=1, - bad_words_ids=[[processor.tokenizer.unk_token_id]], - output_scores=True, - return_dict_in_generate=True, - ) - sequence = processor.batch_decode(outputs.sequences)[0] - sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") - sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token - - # verify generated sequence - self.assertEqual( - sequence, " When is the coffee break? 11-14 to 11:39 a.m." 
- ) - - # verify scores - self.assertEqual(len(outputs.scores), 11) - self.assertTrue( - ops.allclose( - outputs.scores[0][0, :3], mindspore.tensor([5.6019, -3.5070, 13.7123]), atol=1e-4 - ) - ) - - @slow - def test_inference_cordv2(self): - processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") - model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") - dataset = load_dataset("hf-internal-testing/example-documents", split="test") - image = dataset[2]["image"] - - pixel_values = processor(images=image, return_tensors="ms").pixel_values - decoder_input_ids = processor.tokenizer( - "", add_special_tokens=False, return_tensors="ms" - ).input_ids - - # step 1: single forward pass - with no_grad(): - outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1, model.decoder.config.vocab_size) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-27.4344, -3.2686, -19.3524]) - self.assertTrue(ops.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) - - # step 2: generation - task_prompt = "" - decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="ms").input_ids - decoder_input_ids = decoder_input_ids - - outputs = model.generate( - pixel_values, - decoder_input_ids=decoder_input_ids, - max_length=model.decoder.config.max_position_embeddings, - early_stopping=True, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - use_cache=True, - num_beams=1, - bad_words_ids=[[processor.tokenizer.unk_token_id]], - output_scores=True, - return_dict_in_generate=True, - ) - - sequence = processor.batch_decode(outputs.sequences)[0] - sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") - sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token - - # verify generated sequence - expected_sequence = " CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000" # noqa: E231 # fmt: skip - self.assertEqual(sequence, expected_sequence) - - # verify scores - self.assertEqual(len(outputs.scores), 43) - self.assertTrue( - ops.allclose( - outputs.scores[0][0, :3], mindspore.tensor([-27.4344, -3.2686, -19.3524]), atol=1e-4 - ) - ) - - @slow - def test_inference_rvlcdip(self): - processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") - model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") - - dataset = load_dataset("hf-internal-testing/example-documents", split="test") - image = dataset[1]["image"] - - pixel_values = processor(images=image, return_tensors="ms").pixel_values - - # step 1: single forward pass - decoder_input_ids = processor.tokenizer( - "", add_special_tokens=False, return_tensors="ms" - ).input_ids - with no_grad(): - outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1, model.decoder.config.vocab_size) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([-17.6490, -4.8381, -15.7577]) - self.assertTrue(ops.allclose(logits[0, 0, :3], expected_slice, atol=1e-4)) - - # step 2: generation - task_prompt = "" - decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, 
return_tensors="ms").input_ids - decoder_input_ids = decoder_input_ids - - outputs = model.generate( - pixel_values, - decoder_input_ids=decoder_input_ids, - max_length=model.decoder.config.max_position_embeddings, - early_stopping=True, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - use_cache=True, - num_beams=1, - bad_words_ids=[[processor.tokenizer.unk_token_id]], - output_scores=True, - return_dict_in_generate=True, - ) - - sequence = processor.batch_decode(outputs.sequences)[0] - sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") - sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token - - # verify generated sequence - self.assertEqual(sequence, "") - - # verify scores - self.assertEqual(len(outputs.scores), 4) - self.assertTrue( - ops.allclose( - outputs.scores[0][0, :3], mindspore.tensor([-17.6490, -4.8381, -15.7577]), atol=1e-4 - ) - ) - - -@require_levenshtein -@require_nltk -@require_mindspore -@require_vision -@slow -class NougatModelIntegrationTest(unittest.TestCase): - @cached_property - def default_processor(self): - return NougatProcessor.from_pretrained("facebook/nougat-base") if is_vision_available() else None - - @cached_property - def default_model(self): - return VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base") - - @cached_property - def default_image(self): - filepath = hf_hub_download( - repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_pdf.png", repo_type="dataset" - ) - image = Image.open(filepath).convert("RGB") - return image - - def test_forward_pass(self): - processor = self.default_processor - model = self.default_model - image = self.default_image - pixel_values = processor(images=image, return_tensors="ms").pixel_values - - decoder_input_ids = mindspore.tensor([[0]]) - outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids) - logits = outputs.logits - - # verify the logits - expected_shape = (1, 1, model.decoder.config.vocab_size) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [1.6253, -4.2179, 5.8532, -2.7911, -5.0609, -4.7397, -4.2890, -5.1073, -4.8908, -4.9729] - ) - - self.assertTrue(ops.allclose(logits[0, 0, :10], expected_slice, atol=1e-4)) - - def test_generation(self): - processor = self.default_processor - model = self.default_model - image = self.default_image - pixel_values = processor(images=image, return_tensors="ms").pixel_values - - outputs = model.generate( - pixel_values, - min_length=1, - max_length=3584, - bad_words_ids=[[processor.tokenizer.unk_token_id]], - return_dict_in_generate=True, - output_scores=True, - ) - - # verify generated sequence - generated = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0] - expected_raw_generation = "# Nougat: Neural Optical Understanding for Academic Documents\n\n Lukas Blecher\n\nCorrespondence to: lblecher@meta.com\n\nGuillem Cucurull\n\nThomas Scialom\n\nRobert Stojnic\n\nMeta AI\n\nThe paper reports 8.1M papers but the authors recently updated the numbers on the GitHub page https://github.com/allenai/s2orc\n\n###### Abstract\n\nScientific knowledge is predominantly stored in books and scientific journals, often in the form of PDFs. However, the PDF format leads to a loss of semantic information, particularly for mathematical expressions. 
We propose Nougat (**N**eural **O**ptical **U**nderstanding for **A**cademic Documents), a Visual Transformer model that performs an _Optical Character Recognition_ (OCR) task for processing scientific documents into a markup language, and demonstrate the effectiveness of our model on a new dataset of scientific documents. The proposed approach offers a promising solution to enhance the accessibility of scientific knowledge in the digital age, by bridging the gap between human-readable documents and machine-readable text. We release the models and code to accelerate future work on scientific text recognition.\n\n## 1 Introduction\n\nThe majority of scientific knowledge is stored in books or published in scientific journals, most commonly in the Portable Document Format (PDF). Next to HTML, PDFs are the second most prominent data format on the internet, making up 2.4% of common crawl [1]. However, the information stored in these files is very difficult to extract into any other formats. This is especially true for highly specialized documents, such as scientific research papers, where the semantic information of mathematical expressions is lost.\n\nExisting Optical Character Recognition (OCR) engines, such as Tesseract OCR [2], excel at detecting and classifying individual characters and words in an image, but fail to understand the relationship between them due to their line-by-line approach. This means that they treat superscripts and subscripts in the same way as the surrounding text, which is a significant drawback for mathematical expressions. In mathematical notations like fractions, exponents, and matrices, relative positions of characters are crucial.\n\nConverting academic research papers into machine-readable text also enables accessibility and searchability of science as a whole. The information of millions of academic papers can not be fully accessed because they are locked behind an unreadable format. Existing corpora, such as the S2ORC dataset [3], capture the text of 12M2 papers using GROBID [4], but are missing meaningful representations of the mathematical equations.\n\nFootnote 2: The paper reports 8.1M papers but the authors recently updated the numbers on the GitHub page https://github.com/allenai/s2orc\n\nTo this end, we introduce Nougat, a transformer based model that can convert images of document pages to formatted markup text.\n\nThe primary contributions in this paper are\n\n* Release of a pre-trained model capable of converting a PDF to a lightweight markup language. We release the code and the model on GitHub3 Footnote 3: https://github.com/facebookresearch/nougat\n* We introduce a pipeline to create dataset for pairing PDFs to source code\n* Our method is only dependent on the image of a page, allowing access to scanned papers and books" - self.assertTrue(generated == expected_raw_generation) - - # verify postprocessed sequence - generated = processor.post_process_generation(generated, fix_markdown=False) - expected_generation = "\n\n# Nougat: Neural Optical Understanding for Academic Documents\n\n Lukas Blecher\n\nCorrespondence to: lblecher@meta.com\n\nGuillem Cucurull\n\nThomas Scialom\n\nRobert Stojnic\n\nMeta AI\n\nThe paper reports 8.1M papers but the authors recently updated the numbers on the GitHub page https://github.com/allenai/s2orc\n\n###### Abstract\n\nScientific knowledge is predominantly stored in books and scientific journals, often in the form of PDFs. 
However, the PDF format leads to a loss of semantic information, particularly for mathematical expressions. We propose Nougat (**N**eural **O**ptical **U**nderstanding for **A**cademic Documents), a Visual Transformer model that performs an _Optical Character Recognition_ (OCR) task for processing scientific documents into a markup language, and demonstrate the effectiveness of our model on a new dataset of scientific documents. The proposed approach offers a promising solution to enhance the accessibility of scientific knowledge in the digital age, by bridging the gap between human-readable documents and machine-readable text. We release the models and code to accelerate future work on scientific text recognition.\n\n## 1 Introduction\n\nThe majority of scientific knowledge is stored in books or published in scientific journals, most commonly in the Portable Document Format (PDF). Next to HTML, PDFs are the second most prominent data format on the internet, making up 2.4% of common crawl [1]. However, the information stored in these files is very difficult to extract into any other formats. This is especially true for highly specialized documents, such as scientific research papers, where the semantic information of mathematical expressions is lost.\n\nExisting Optical Character Recognition (OCR) engines, such as Tesseract OCR [2], excel at detecting and classifying individual characters and words in an image, but fail to understand the relationship between them due to their line-by-line approach. This means that they treat superscripts and subscripts in the same way as the surrounding text, which is a significant drawback for mathematical expressions. In mathematical notations like fractions, exponents, and matrices, relative positions of characters are crucial.\n\nConverting academic research papers into machine-readable text also enables accessibility and searchability of science as a whole. The information of millions of academic papers can not be fully accessed because they are locked behind an unreadable format. Existing corpora, such as the S2ORC dataset [3], capture the text of 12M2 papers using GROBID [4], but are missing meaningful representations of the mathematical equations.\n\nFootnote 2: The paper reports 8.1M papers but the authors recently updated the numbers on the GitHub page https://github.com/allenai/s2orc\n\nTo this end, we introduce Nougat, a transformer based model that can convert images of document pages to formatted markup text.\n\nThe primary contributions in this paper are\n\n* Release of a pre-trained model capable of converting a PDF to a lightweight markup language. 
We release the code and the model on GitHub3 Footnote 3: https://github.com/facebookresearch/nougat\n* We introduce a pipeline to create dataset for pairing PDFs to source code\n* Our method is only dependent on the image of a page, allowing access to scanned papers and books" - self.assertTrue(generated == expected_generation) - - # verify scores - self.assertEqual(len(outputs.scores), 741) - self.assertTrue( - ops.allclose( - outputs.scores[0][0, :3], mindspore.tensor([1.6253, -4.2179, 5.8532]), atol=1e-4 - ) - ) \ No newline at end of file diff --git a/tests/transformers/models/vision_text_dual_encoder/__init__.py b/tests/transformers/models/vision_text_dual_encoder/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py b/tests/transformers/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py deleted file mode 100644 index 31de3ebac..000000000 --- a/tests/transformers/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py +++ /dev/null @@ -1,430 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================ -"""Testing suite for the Mindspore VisionTextDualEncoder model.""" - -import collections -import tempfile -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import (require_mindspore, require_vision, slow, - is_vision_available, is_mindspore_available) - - -from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask -from ..bert.test_modeling_bert import BertModelTester -from ..clip.test_modeling_clip import CLIPVisionModelTester -from ..deit.test_modeling_deit import DeiTModelTester -from ..roberta.test_modeling_roberta import RobertaModelTester -from ..vit.test_modeling_vit import ViTModelTester - - -if is_mindspore_available(): - import mindspore as ms - - from mindnlp.transformers import ( - BertModel, - CLIPVisionModel, - DeiTModel, - RobertaModel, - VisionTextDualEncoderConfig, - VisionTextDualEncoderModel, - ViTModel, - ) - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import VisionTextDualEncoderProcessor - - -# Inspired by -# https://github.com/rwightman/pytorch-image-models/blob/b9bd960a032c75ca6b808ddeed76bee5f3ed4972/timm/models/layers/helpers.py -# From PyTorch internals -def to_2tuple(x): - if isinstance(x, collections.abc.Iterable): - return x - return (x, x) - - -@require_mindspore -class VisionTextDualEncoderMixin: - def get_vision_text_model(self, config, text_config): - pass - - def prepare_config_and_inputs(self): - pass - - def get_pretrained_model_and_inputs(self): - pass - - def check_model_from_pretrained_configs( - self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs - ): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) - - model = 
VisionTextDualEncoderModel(config) - model.set_train(False) - - output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask) - - self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], config.projection_dim)) - self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], config.projection_dim)) - - def check_vision_text_dual_encoder_model( - self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs - ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) - model = VisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model) - model.set_train(False) - - output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask) - - self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], model.config.projection_dim)) - self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], model.config.projection_dim)) - - def check_vision_text_dual_encoder_from_pretrained( - self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs - ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) - kwargs = {"vision_model": vision_model, "text_model": text_model} - model = VisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs) - model.set_train(False) - - output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask) - - self.assertEqual(output["text_embeds"].shape, (input_ids.shape[0], model.config.projection_dim)) - self.assertEqual(output["image_embeds"].shape, (pixel_values.shape[0], model.config.projection_dim)) - - def check_save_load(self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) - model = VisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model) - model.set_train(False) - - output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask) - out_1 = output[0].numpy() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = VisionTextDualEncoderModel.from_pretrained(tmpdirname).set_train(False) - - after_output = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask) - out_2 = after_output[0].numpy() - max_diff = np.amax(np.abs(out_2 - out_1)) - self.assertLessEqual(max_diff, 1e-5) - - def check_vision_text_output_attention( - self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs - ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) - model = VisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model) - model.set_train(False) - - output = model( - input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True - ) - - vision_attentions = output.vision_model_output.attentions - self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers) - - # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token) - image_size = to_2tuple(vision_model.config.image_size) - patch_size = to_2tuple(vision_model.config.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_len = num_patches + 1 - self.assertEqual(vision_attentions[0].shape[-3:], (vision_config.num_attention_heads, seq_len, seq_len)) - - 
text_attentions = output.text_model_output.attentions - self.assertEqual(len(text_attentions), text_config.num_hidden_layers) - - self.assertEqual( - text_attentions[0].shape[-3:], - (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]), - ) - - def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): - diff = np.abs((a - b)).max() - self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") - - def test_vision_text_dual_encoder_model(self): - inputs_dict = self.prepare_config_and_inputs() - self.check_vision_text_dual_encoder_model(**inputs_dict) - - def test_model_from_pretrained_configs(self): - inputs_dict = self.prepare_config_and_inputs() - self.check_model_from_pretrained_configs(**inputs_dict) - - def test_vision_text_dual_encoder_from_pretrained(self): - inputs_dict = self.prepare_config_and_inputs() - self.check_vision_text_dual_encoder_from_pretrained(**inputs_dict) - - def test_save_load(self): - inputs_dict = self.prepare_config_and_inputs() - self.check_save_load(**inputs_dict) - - def test_vision_text_output_attention(self): - inputs_dict = self.prepare_config_and_inputs() - self.check_vision_text_output_attention(**inputs_dict) - - @slow - def test_real_model_save_load_from_pretrained(self): - model_2, inputs = self.get_pretrained_model_and_inputs() - - outputs = model_2(**inputs) - out_2 = outputs[0].numpy() - - with tempfile.TemporaryDirectory() as tmp_dirname: - model_2.save_pretrained(tmp_dirname) - model_1 = VisionTextDualEncoderModel.from_pretrained(tmp_dirname) - - after_outputs = model_1(**inputs) - out_1 = after_outputs[0].numpy() - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - -@require_mindspore -class ViTBertModelTest(VisionTextDualEncoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = VisionTextDualEncoderModel.from_vision_text_pretrained( - vision_model_name_or_path="hf-internal-testing/tiny-random-vit", - text_model_name_or_path="hf-internal-testing/tiny-bert" - ) - batch_size = 13 - pixel_values = floats_tensor( - [ - batch_size, - model.vision_model.config.num_channels, - model.vision_model.config.image_size, - model.vision_model.config.image_size, - ] - ) - input_ids = ids_tensor([batch_size, 4], model.text_model.config.vocab_size) - attention_mask = random_attention_mask([batch_size, 4]) - inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask} - - return model, inputs - - def get_vision_text_model(self, vision_config, text_config): - vision_model = ViTModel(vision_config).set_train(False) - text_model = BertModel(text_config).set_train(False) - return vision_model, text_model - - def prepare_config_and_inputs(self): - vit_model_tester = ViTModelTester(self) - bert_model_tester = BertModelTester(self) - vision_config_and_inputs = vit_model_tester.prepare_config_and_inputs() - text_config_and_inputs = bert_model_tester.prepare_config_and_inputs() - - vision_config, pixel_values, _ = vision_config_and_inputs - - ( - text_config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = text_config_and_inputs - - return { - "text_config": text_config, - "vision_config": vision_config, - "pixel_values": pixel_values, - "attention_mask": input_mask, - "input_ids": input_ids, - "text_token_type_ids": token_type_ids, - "text_sequence_labels": sequence_labels, - "text_token_labels": token_labels, - "text_choice_labels": choice_labels, 
- } - - -@require_mindspore -class DeiTRobertaModelTest(VisionTextDualEncoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = VisionTextDualEncoderModel.from_vision_text_pretrained( - vision_model_name_or_path="hf-internal-testing/tiny-random-deit", - text_model_name_or_path="hf-internal-testing/tiny-random-roberta" - ) - batch_size = 13 - pixel_values = floats_tensor( - [ - batch_size, - model.vision_model.config.num_channels, - model.vision_model.config.image_size, - model.vision_model.config.image_size, - ] - ) - input_ids = ids_tensor([batch_size, 4], model.text_model.config.vocab_size) - attention_mask = random_attention_mask([batch_size, 4]) - inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask} - - return model, inputs - - def check_vision_text_output_attention( - self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs - ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) - model = VisionTextDualEncoderModel(vision_model=vision_model, text_model=text_model) - model.set_train(False) - - output = model( - input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=True - ) - - vision_attentions = output.vision_model_output.attentions - self.assertEqual(len(vision_attentions), vision_config.num_hidden_layers) - - # in DEiT, the seq_len equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens) - image_size = to_2tuple(vision_model.config.image_size) - patch_size = to_2tuple(vision_model.config.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_len = num_patches + 2 - self.assertEqual(vision_attentions[0].shape[-3:], (vision_config.num_attention_heads, seq_len, seq_len)) - - text_attentions = output.text_model_output.attentions - self.assertEqual(len(text_attentions), text_config.num_hidden_layers) - - self.assertEqual( - text_attentions[0].shape[-3:], - (text_config.num_attention_heads, input_ids.shape[-1], input_ids.shape[-1]), - ) - - def get_vision_text_model(self, vision_config, text_config): - vision_model = DeiTModel(vision_config).set_train(False) - text_model = RobertaModel(text_config).set_train(False) - return vision_model, text_model - - def prepare_config_and_inputs(self): - vit_model_tester = DeiTModelTester(self) - bert_model_tester = RobertaModelTester(self) - vision_config_and_inputs = vit_model_tester.prepare_config_and_inputs() - text_config_and_inputs = bert_model_tester.prepare_config_and_inputs() - - vision_config, pixel_values, _ = vision_config_and_inputs - - ( - text_config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = text_config_and_inputs - - return { - "text_config": text_config, - "vision_config": vision_config, - "pixel_values": pixel_values, - "attention_mask": input_mask, - "input_ids": input_ids, - "text_token_type_ids": token_type_ids, - "text_sequence_labels": sequence_labels, - "text_token_labels": token_labels, - "text_choice_labels": choice_labels, - } - - # skip as DeiT is not available in Flax - def test_pt_flax_equivalence(self): - pass - - -@require_mindspore -class CLIPVisionBertModelTest(VisionTextDualEncoderMixin, unittest.TestCase): - def get_pretrained_model_and_inputs(self): - model = VisionTextDualEncoderModel.from_vision_text_pretrained( - vision_model_name_or_path="hf-internal-testing/tiny-random-clip", - 
text_model_name_or_path="hf-internal-testing/tiny-bert" - ) - batch_size = 13 - pixel_values = floats_tensor( - [ - batch_size, - model.vision_model.config.num_channels, - model.vision_model.config.image_size, - model.vision_model.config.image_size, - ] - ) - input_ids = ids_tensor([batch_size, 4], model.text_model.config.vocab_size) - attention_mask = random_attention_mask([batch_size, 4]) - inputs = {"pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask} - - return model, inputs - - def get_vision_text_model(self, vision_config, text_config): - vision_model = CLIPVisionModel(vision_config).set_train(False) - text_model = BertModel(text_config).set_train(False) - return vision_model, text_model - - def prepare_config_and_inputs(self): - clip_model_tester = CLIPVisionModelTester(self) - bert_model_tester = BertModelTester(self) - vision_config_and_inputs = clip_model_tester.prepare_config_and_inputs() - text_config_and_inputs = bert_model_tester.prepare_config_and_inputs() - - vision_config, pixel_values = vision_config_and_inputs - - ( - text_config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = text_config_and_inputs - - return { - "text_config": text_config, - "vision_config": vision_config, - "pixel_values": pixel_values, - "attention_mask": input_mask, - "input_ids": input_ids, - "text_token_type_ids": token_type_ids, - "text_sequence_labels": sequence_labels, - "text_token_labels": token_labels, - "text_choice_labels": choice_labels, - } - - -@require_vision -@require_mindspore -class VisionTextDualEncoderIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model = VisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian", logit_scale_init_value=1.0, ignore_mismatched_sizes=True) - processor = VisionTextDualEncoderProcessor.from_pretrained("clip-italian/clip-italian", ignore_mismatched_sizes=True) - - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - inputs = processor( - text=["una foto di un gatto", "una foto di un cane"], images=image, padding=True, return_tensors="ms" - ) - - outputs = model(**inputs) - - # verify the logits - self.assertEqual(outputs.logits_per_image.shape, (inputs.pixel_values.shape[0], inputs.input_ids.shape[0])) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - expected_logits = ms.tensor([[1.2284727, 0.3104122]]) - - self.assertTrue(np.allclose(outputs.logits_per_image.asnumpy(), expected_logits.asnumpy(), atol=1e-3)) \ No newline at end of file diff --git a/tests/transformers/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py b/tests/transformers/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py deleted file mode 100644 index 7e850c33b..000000000 --- a/tests/transformers/models/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================ - -import json -import os -import shutil -import tempfile -import unittest - -import numpy as np - -from mindnlp.transformers import BertTokenizerFast -from mindnlp.transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES, BertTokenizer -from mindnlp.utils.testing_utils import require_tokenizers, require_vision, is_vision_available -from mindnlp.configs import IMAGE_PROCESSOR_NAME - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import VisionTextDualEncoderProcessor, ViTImageProcessor - - -@require_tokenizers -@require_vision -class VisionTextDualEncoderProcessorTest(unittest.TestCase): - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() - - vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low", "lowest"] # fmt: skip - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - image_processor_map = { - "do_resize": True, - "size": {"height": 18, "width": 18}, - "do_normalize": True, - "image_mean": [0.5, 0.5, 0.5], - "image_std": [0.5, 0.5, 0.5], - } - self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME) - with open(self.image_processor_file, "w", encoding="utf-8") as fp: - json.dump(image_processor_map, fp) - - def get_tokenizer(self, **kwargs): - return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_image_processor(self, **kwargs): - return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - image_processor = self.get_image_processor() - - processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor) - - processor.save_pretrained(self.tmpdirname) - processor = VisionTextDualEncoderProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, (BertTokenizer, BertTokenizerFast)) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) - - def test_save_load_pretrained_additional_features(self): - processor = VisionTextDualEncoderProcessor( - tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor() - ) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) - - processor = VisionTextDualEncoderProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, (BertTokenizer, BertTokenizerFast)) - - self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.image_processor, ViTImageProcessor) - - def test_image_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor) - - image_input = self.prepare_image_inputs() - - input_feat_extract = image_processor(image_input, return_tensors="np") - input_processor = processor(images=image_input, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_processor(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values"]) - - # test if it raises when no input is passed - with self.assertRaises(ValueError): - processor() - - def test_tokenizer_decode(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, 
image_processor=image_processor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - image_processor = self.get_image_processor() - tokenizer = self.get_tokenizer() - - processor = VisionTextDualEncoderProcessor(tokenizer=tokenizer, image_processor=image_processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - - self.assertListEqual(list(inputs.keys()), processor.model_input_names) \ No newline at end of file diff --git a/tests/transformers/models/visual_bert/__init__.py b/tests/transformers/models/visual_bert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/visual_bert/test_modeling_visual_bert.py b/tests/transformers/models/visual_bert/test_modeling_visual_bert.py deleted file mode 100644 index 4b87fed57..000000000 --- a/tests/transformers/models/visual_bert/test_modeling_visual_bert.py +++ /dev/null @@ -1,694 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore VisualBERT model.""" - -import copy -import unittest - -from mindnlp.transformers import VisualBertConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import ( - VisualBertForMultipleChoice, - VisualBertForPreTraining, - VisualBertForQuestionAnswering, - VisualBertForRegionToPhraseAlignment, - VisualBertForVisualReasoning, - VisualBertModel, - ) - - -class VisualBertModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - visual_seq_length=5, - is_training=True, - use_attention_mask=True, - use_visual_attention_mask=True, - use_token_type_ids=True, - use_visual_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - visual_embedding_dim=20, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.visual_seq_length = visual_seq_length - self.is_training = is_training - self.use_attention_mask = use_attention_mask - self.use_visual_attention_mask = use_visual_attention_mask - self.use_token_type_ids = use_token_type_ids - self.use_visual_token_type_ids = use_visual_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.visual_embedding_dim = visual_embedding_dim - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def get_config(self): - return VisualBertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - visual_embedding_dim=self.visual_embedding_dim, - num_labels=self.num_labels, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_common(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - visual_embeds = floats_tensor([self.batch_size, self.visual_seq_length, self.visual_embedding_dim]) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ops.ones((self.batch_size, self.seq_length), dtype=mindspore.int64) - - visual_attention_mask = 
None - if self.use_visual_attention_mask: - visual_attention_mask = ops.ones( - (self.batch_size, self.visual_seq_length), dtype=mindspore.int64 - ) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - visual_token_type_ids = None - if self.use_visual_token_type_ids: - visual_token_type_ids = ids_tensor([self.batch_size, self.visual_seq_length], self.type_vocab_size) - - config = self.get_config() - return config, { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "visual_embeds": visual_embeds, - "visual_token_type_ids": visual_token_type_ids, - "visual_attention_mask": visual_attention_mask, - } - - def prepare_config_and_inputs_for_pretraining(self): - masked_lm_labels = None - sentence_image_labels = None - - if self.use_labels: - masked_lm_labels = ids_tensor([self.batch_size, self.seq_length + self.visual_seq_length], self.vocab_size) - sentence_image_labels = ids_tensor( - [self.batch_size], - self.type_sequence_label_size, - ) - - config, input_dict = self.prepare_config_and_inputs_for_common() - - input_dict.update({"labels": masked_lm_labels, "sentence_image_labels": sentence_image_labels}) - - return config, input_dict - - def prepare_config_and_inputs_for_multiple_choice(self): - input_ids = ids_tensor([self.batch_size, self.num_choices, self.seq_length], self.vocab_size) - visual_embeds = floats_tensor( - [self.batch_size, self.num_choices, self.visual_seq_length, self.visual_embedding_dim] - ) - - attention_mask = None - if self.use_attention_mask: - attention_mask = ops.ones( - (self.batch_size, self.num_choices, self.seq_length), dtype=mindspore.int64 - ) - - visual_attention_mask = None - if self.use_visual_attention_mask: - visual_attention_mask = ops.ones( - (self.batch_size, self.num_choices, self.visual_seq_length), dtype=mindspore.int64 - ) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.num_choices, self.seq_length], self.type_vocab_size) - - visual_token_type_ids = None - if self.use_visual_token_type_ids: - visual_token_type_ids = ids_tensor( - [self.batch_size, self.num_choices, self.visual_seq_length], self.type_vocab_size - ) - - labels = None - - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - return config, { - "input_ids": input_ids, - "token_type_ids": token_type_ids, - "attention_mask": attention_mask, - "visual_embeds": visual_embeds, - "visual_token_type_ids": visual_token_type_ids, - "visual_attention_mask": visual_attention_mask, - "labels": labels, - } - - def prepare_config_and_inputs_for_vqa(self): - vqa_labels = None - - if self.use_labels: - vqa_labels = floats_tensor([self.batch_size, self.num_labels]) - - config, input_dict = self.prepare_config_and_inputs_for_common() - - input_dict.update({"labels": vqa_labels}) - return config, input_dict - - def prepare_config_and_inputs_for_nlvr(self): - nlvr_labels = None - - if self.use_labels: - nlvr_labels = ids_tensor([self.batch_size], self.num_labels) - - config, input_dict = self.prepare_config_and_inputs_for_common() - - input_dict.update({"labels": nlvr_labels}) - return config, input_dict - - def prepare_config_and_inputs_for_flickr(self): - region_to_phrase_position = ops.cat( - ( - ids_tensor([self.batch_size, self.seq_length], self.visual_seq_length), - ops.ones(self.batch_size, self.visual_seq_length, dtype=mindspore.int64) * 
-1, - ), - dim=-1, - ) - flickr_labels = None - if self.use_labels: - flickr_labels = floats_tensor( - [self.batch_size, self.seq_length + self.visual_seq_length, self.visual_seq_length] - ) - - config, input_dict = self.prepare_config_and_inputs_for_common() - - input_dict.update({"region_to_phrase_position": region_to_phrase_position, "labels": flickr_labels}) - return config, input_dict - - def create_and_check_model(self, config, input_dict): - model = VisualBertModel(config=config) - model.eval() - result = model(**input_dict) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.seq_length + self.visual_seq_length, self.hidden_size), - ) - - def create_and_check_for_pretraining(self, config, input_dict): - model = VisualBertForPreTraining(config=config) - model.eval() - result = model(**input_dict) - self.parent.assertEqual( - result.prediction_logits.shape, - (self.batch_size, self.seq_length + self.visual_seq_length, self.vocab_size), - ) - - def create_and_check_for_vqa(self, config, input_dict): - model = VisualBertForQuestionAnswering(config=config) - model.eval() - result = model(**input_dict) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_multiple_choice(self, config, input_dict): - model = VisualBertForMultipleChoice(config=config) - model.eval() - result = model(**input_dict) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_nlvr(self, config, input_dict): - model = VisualBertForVisualReasoning(config=config) - model.eval() - result = model(**input_dict) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_flickr(self, config, input_dict): - model = VisualBertForRegionToPhraseAlignment(config=config) - model.eval() - result = model(**input_dict) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.seq_length + self.visual_seq_length, self.visual_seq_length) - ) - - -@require_mindspore -class VisualBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - VisualBertModel, - VisualBertForMultipleChoice, - VisualBertForVisualReasoning, - VisualBertForRegionToPhraseAlignment, - VisualBertForQuestionAnswering, - VisualBertForPreTraining, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = {"feature-extraction": VisualBertModel} if is_mindspore_available() else {} - test_torchscript = False - test_pruning = False - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - if model_class == VisualBertForMultipleChoice: - for key in inputs_dict.keys(): - value = inputs_dict[key] - if isinstance(value, mindspore.Tensor) and value.ndim > 1: - if key != "visual_embeds": - inputs_dict[key] = ( - inputs_dict[key].unsqueeze(1).broadcast_to((-1, self.model_tester.num_choices, -1)) - ) - else: - inputs_dict[key] = ( - inputs_dict[key] - .unsqueeze(1) - .broadcast_to((-1, self.model_tester.num_choices, -1, self.model_tester.visual_embedding_dim)) - ) - - elif model_class == VisualBertForRegionToPhraseAlignment: - total_length = self.model_tester.seq_length + self.model_tester.visual_seq_length - batch_size = self.model_tester.batch_size - inputs_dict["region_to_phrase_position"] = ops.zeros( - (batch_size, total_length), - dtype=mindspore.int64, - ) - - if return_labels: - if model_class == VisualBertForMultipleChoice: - inputs_dict["labels"] = 
ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class == VisualBertForPreTraining: - total_length = self.model_tester.seq_length + self.model_tester.visual_seq_length - batch_size = self.model_tester.batch_size - inputs_dict["labels"] = ops.zeros( - (batch_size, total_length), - dtype=mindspore.int64, - ) - inputs_dict["sentence_image_labels"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - - # Flickr expects float labels - elif model_class == VisualBertForRegionToPhraseAlignment: - batch_size = self.model_tester.batch_size - total_length = self.model_tester.seq_length + self.model_tester.visual_seq_length - - inputs_dict["labels"] = ops.ones( - ( - batch_size, - total_length, - self.model_tester.visual_seq_length, - ), - dtype=mindspore.float32, - ) - - # VQA expects float labels - elif model_class == VisualBertForQuestionAnswering: - inputs_dict["labels"] = ops.ones( - (self.model_tester.batch_size, self.model_tester.num_labels), - dtype=mindspore.float32, - ) - - elif model_class == VisualBertForVisualReasoning: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size), dtype=mindspore.int64 - ) - - return inputs_dict - - def setUp(self): - self.model_tester = VisualBertModelTester(self) - self.config_tester = ConfigTester(self, config_class=VisualBertConfig, hidden_size=37) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - visual_seq_len = getattr(self.model_tester, "visual_seq_length", None) - - encoder_seq_length = (seq_len if seq_len is not None else 0) + ( - visual_seq_len if visual_seq_len is not None else 0 - ) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - 
outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length + self.model_tester.visual_seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - def test_model_for_vqa(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_vqa() - self.model_tester.create_and_check_for_vqa(*config_and_inputs) - - def test_model_for_nlvr(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_nlvr() - self.model_tester.create_and_check_for_nlvr(*config_and_inputs) - - def test_model_for_multiple_choice(self): - config_and_inputs = 
self.model_tester.prepare_config_and_inputs_for_multiple_choice() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_model_for_flickr(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_flickr() - self.model_tester.create_and_check_for_flickr(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "uclanlp/visualbert-vqa" - model = VisualBertModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - -@require_mindspore -class VisualBertModelIntegrationTest(unittest.TestCase): - @slow - def test_inference_vqa_coco_pre(self): - model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre") - - input_ids = mindspore.tensor([1, 2, 3, 4, 5, 6], dtype=mindspore.int64).reshape(1, -1) - token_type_ids = mindspore.tensor([0, 0, 0, 1, 1, 1], dtype=mindspore.int64).reshape(1, -1) - visual_embeds = ops.ones(size=(1, 10, 2048), dtype=mindspore.float32) * 0.5 - visual_token_type_ids = ops.ones(size=(1, 10), dtype=mindspore.int64) - attention_mask = mindspore.tensor([1] * 6).reshape(1, -1) - visual_attention_mask = mindspore.tensor([1] * 10).reshape(1, -1) - - with no_grad(): - output = model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - visual_embeds=visual_embeds, - visual_attention_mask=visual_attention_mask, - visual_token_type_ids=visual_token_type_ids, - ) - - vocab_size = 30522 - - expected_shape = (1, 16, vocab_size) - self.assertEqual(output.prediction_logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[[-5.1858, -5.1903, -4.9142], [-6.2214, -5.9238, -5.8381], [-6.3027, -5.9939, -5.9297]]] - ) - - self.assertTrue(ops.allclose(output.prediction_logits[:, :3, :3], expected_slice, atol=1e-4)) - - expected_shape_2 = (1, 2) - self.assertEqual(output.seq_relationship_logits.shape, expected_shape_2) - - expected_slice_2 = mindspore.tensor([[0.7393, 0.1754]]) - - self.assertTrue(ops.allclose(output.seq_relationship_logits, expected_slice_2, atol=1e-4)) - - @slow - def test_inference_vqa(self): - model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa") - - input_ids = mindspore.tensor([1, 2, 3, 4, 5, 6], dtype=mindspore.int64).reshape(1, -1) - token_type_ids = mindspore.tensor([0, 0, 0, 1, 1, 1], dtype=mindspore.int64).reshape(1, -1) - visual_embeds = ops.ones(size=(1, 10, 2048), dtype=mindspore.float32) * 0.5 - visual_token_type_ids = ops.ones(size=(1, 10), dtype=mindspore.int64) - attention_mask = mindspore.tensor([1] * 6).reshape(1, -1) - visual_attention_mask = mindspore.tensor([1] * 10).reshape(1, -1) - - with no_grad(): - output = model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - visual_embeds=visual_embeds, - 
visual_attention_mask=visual_attention_mask, - visual_token_type_ids=visual_token_type_ids, - ) - - # vocab_size = 30522 - - expected_shape = (1, 3129) - self.assertEqual(output.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-8.9898, 3.0803, -1.8016, 2.4542, -8.3420, -2.0224, -3.3124, -4.4139, -3.1491, -3.8997]] - ) - - self.assertTrue(ops.allclose(output.logits[:, :10], expected_slice, atol=1e-4)) - - @slow - def test_inference_nlvr(self): - model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2") - - input_ids = mindspore.tensor([1, 2, 3, 4, 5, 6], dtype=mindspore.int64).reshape(1, -1) - token_type_ids = mindspore.tensor([0, 0, 0, 1, 1, 1], dtype=mindspore.int64).reshape(1, -1) - visual_embeds = ops.ones(size=(1, 10, 1024), dtype=mindspore.float32) * 0.5 - visual_token_type_ids = ops.ones(size=(1, 10), dtype=mindspore.int64) - attention_mask = mindspore.tensor([1] * 6).reshape(1, -1) - visual_attention_mask = mindspore.tensor([1] * 10).reshape(1, -1) - - with no_grad(): - output = model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - visual_embeds=visual_embeds, - visual_attention_mask=visual_attention_mask, - visual_token_type_ids=visual_token_type_ids, - ) - - # vocab_size = 30522 - - expected_shape = (1, 2) - self.assertEqual(output.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([[-1.1436, 0.8900]]) - - self.assertTrue(ops.allclose(output.logits, expected_slice, atol=1e-4)) - - @slow - def test_inference_vcr(self): - model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr") - - input_ids = mindspore.tensor([[[1, 2, 3, 4, 5, 6] for i in range(4)]], dtype=mindspore.int64) - attention_mask = ops.ones_like(input_ids) - token_type_ids = ops.ones_like(input_ids) - - visual_embeds = ops.ones(size=(1, 4, 10, 512), dtype=mindspore.float32) * 0.5 - visual_token_type_ids = ops.ones(size=(1, 4, 10), dtype=mindspore.int64) - visual_attention_mask = ops.ones_like(visual_token_type_ids) - - with no_grad(): - output = model( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - visual_embeds=visual_embeds, - visual_attention_mask=visual_attention_mask, - visual_token_type_ids=visual_token_type_ids, - ) - - # vocab_size = 30522 - - expected_shape = (1, 4) - self.assertEqual(output.logits.shape, expected_shape) - - expected_slice = mindspore.tensor([[-7.7697, -7.7697, -7.7697, -7.7697]]) - - self.assertTrue(ops.allclose(output.logits, expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/tests/transformers/models/vit/__init__.py b/tests/transformers/models/vit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vit/test_image_processing_vit.py b/tests/transformers/models/vit/test_image_processing_vit.py deleted file mode 100644 index e67987a8c..000000000 --- a/tests/transformers/models/vit/test_image_processing_vit.py +++ /dev/null @@ -1,92 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils.import_utils import is_vision_available - -if is_vision_available(): - from mindnlp.transformers import ViTImageProcessor - - -class ViTImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - size = size if size is not None else {"height": 18, "width": 18} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - } - - def expected_output_image_shape(self, images): - return self.num_channels, self.size["height"], self.size["width"] - - -@require_mindspore -@require_vision -class ViTImageProcessingTest(unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = ViTImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) - self.assertEqual(image_processor.size, {"height": 42, "width": 42}) diff --git a/tests/transformers/models/vit/test_modeling_vit.py b/tests/transformers/models/vit/test_modeling_vit.py deleted file mode 100644 index be71df797..000000000 --- a/tests/transformers/models/vit/test_modeling_vit.py +++ /dev/null @@ -1,302 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore ViT model. 
""" - -import sys -import numpy as np -import unittest - -from mindnlp.transformers.models.vit import ViTConfig -from mindnlp.utils.testing_utils import ( - require_mindspore, - require_vision, - slow, - is_mindspore_available -) -from mindnlp.utils.import_utils import is_vision_available -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import ViTForImageClassification, ViTForMaskedImageModeling, ViTModel, ViTPreTrainedModel - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ViTImageProcessor - - -class ViTModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - scope=None, - encoder_stride=2, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.encoder_stride = encoder_stride - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return ViTConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - encoder_stride=self.encoder_stride, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ViTModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_masked_image_modeling(self, config, pixel_values, labels): - model = ViTForMaskedImageModeling(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.reconstruction.shape, (self.batch_size, self.num_channels, self.image_size, self.image_size) - ) - - # test 
greyscale images - config.num_channels = 1 - model = ViTForMaskedImageModeling(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.reconstruction.shape, (self.batch_size, 1, self.image_size, self.image_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = ViTForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = ViTForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - pixel_values, - labels, - ) = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ViTModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = ( - ( - ViTForImageClassification, - ViTForMaskedImageModeling, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - {"image-feature-extraction": ViTModel, "image-classification": ViTForImageClassification} - if is_mindspore_available() - else {} - ) - fx_compatible = True - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = ViTModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViTConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="ViT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_image_modeling(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_image_modeling(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/vit-base-patch16-224" - model = ViTModel.from_pretrained(model_name, from_pt = True) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore 
-@require_vision -class ViTModelIntegrationTest(unittest.TestCase): - #@cached_property - def default_image_processor(self): - return ViTImageProcessor.from_pretrained("google/vit-base-patch16-224", from_pt = True) if is_vision_available() else None - @slow - def test_inference_image_classification_head(self): - model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", from_pt = True) - - #image_processor = self.default_image_processor - image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224", from_pt = True) - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = mindspore.tensor([-0.2744, 0.8215, -0.0836]) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @slow - def test_inference_interpolate_pos_encoding(self): - # ViT models have an `interpolate_pos_encoding` argument in their forward method, - # allowing to interpolate the pre-trained position embeddings in order to use - # the model on higher resolutions. The DINO model by Facebook AI leverages this - # to visualize self-attention on higher resolution images. - model = ViTModel.from_pretrained("facebook/dino-vits8", from_pt = True) - - image_processor = ViTImageProcessor.from_pretrained("facebook/dino-vits8", size=480, from_pt = True) - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - pixel_values = inputs.pixel_values - - # forward pass - outputs = model(pixel_values, interpolate_pos_encoding=True) - - # verify the logits - expected_shape = (1, 3601, 384) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[4.2340, 4.3906, -6.6692], [4.5463, 1.8928, -6.7257], [4.4429, 0.8496, -5.8585]] - ) - self.assertTrue(np.allclose(outputs.last_hidden_state[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @slow - def test_inference_fp16(self): - r""" - A small test to make sure that inference work in half precision without any problem. - """ - model = ViTModel.from_pretrained("facebook/dino-vits8", ms_dtype=mindspore.float16, from_pt = True) - #image_processor = self.default_image_processor - image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224", from_pt = True) - - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - pixel_values = inputs.pixel_values - - # forward pass to make sure inference works in fp16 - _ = model(pixel_values) \ No newline at end of file diff --git a/tests/transformers/models/vit_hybrid/__init__.py b/tests/transformers/models/vit_hybrid/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/transformers/models/vit_hybrid/test_modeling_vit_hybrid.py deleted file mode 100644 index 564025020..000000000 --- a/tests/transformers/models/vit_hybrid/test_modeling_vit_hybrid.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore ViT Hybrid model.""" - -import unittest -import numpy as np - -from mindnlp.transformers import ViTHybridConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import nn, ops - - from mindnlp.transformers import ViTHybridForImageClassification, ViTHybridImageProcessor, ViTHybridModel - - -if is_vision_available(): - from PIL import Image - - -class ViTHybridModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=64, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - backbone_featmap_shape=[1, 16, 4, 4], - scope=None, - attn_implementation="eager", - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.backbone_featmap_shape = backbone_featmap_shape - self.attn_implementation = attn_implementation - - # in ViT hybrid, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - # the number of patches is based on the feature map of the backbone, which by default uses an output stride - # of 32, which means that the feature map has a spatial resolution of 1/32 of the input image size - num_patches = (self.image_size // 32) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - backbone_config = { - "global_padding": "same", - "layer_type": "bottleneck", - "depths": [3, 4, 9], - "out_features": ["stage1", "stage2", "stage3"], - "embedding_dynamic_padding": True, - "hidden_sizes": [4, 8, 16, 32], - "num_groups": 2, - } - - return ViTHybridConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - 
num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - backbone_featmap_shape=self.backbone_featmap_shape, - backbone_config=backbone_config, - backbone=None, - attn_implementation=self.attn_implementation, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ViTHybridModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = ViTHybridForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ViTHybridModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViT does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (ViTHybridForImageClassification,) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": ViTHybridModel, "image-classification": ViTHybridForImageClassification} - if is_mindspore_available() - else {} - ) - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - model_split_percents = [0.5, 0.9] - - def setUp(self): - self.model_tester = ViTHybridModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViTHybridConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="ViT does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip - def test_mismatched_shapes_have_properly_initialized_weights(): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - # Skip the check for the backbone - for name, module in model.cells_and_names(): - if module.__class__.__name__ == "ViTHybridPatchEmbeddings": 
- backbone_params = [f"{name}.{key}" for key in module.parameters_dict().keys()] - break - - for name, param in model.parameters_and_names(): - if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @slow - def test_model_from_pretrained(self): - model_name = "google/vit-hybrid-base-bit-384" - model = ViTHybridModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ViTModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ( - ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384") - if is_vision_available() - else None - ) - - @slow - def test_inference_image_classification_head(self): - model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - model.set_train(False) - - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = ms.tensor([-1.9090, -0.4993, -0.2389]) - - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - @slow - def test_accelerate_inference(self): - image_processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384") - model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384") - - image = prepare_img() - - inputs = image_processor(images=image, return_tensors="ms") - outputs = model(**inputs) - logits = outputs.logits - # model predicts one of the 1000 ImageNet classes - predicted_class_idx = logits.argmax(-1).item() - - self.assertTrue(model.config.id2label[predicted_class_idx], "tabby, tabby cat") \ No newline at end of file diff --git a/tests/transformers/models/vit_mae/__init__.py b/tests/transformers/models/vit_mae/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vit_mae/test_modeling_vit_mae.py b/tests/transformers/models/vit_mae/test_modeling_vit_mae.py deleted file mode 100644 index b690af8d3..000000000 --- a/tests/transformers/models/vit_mae/test_modeling_vit_mae.py +++ /dev/null @@ -1,317 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore ViTMAE model. 
""" - - -import math -import tempfile -import unittest - -import numpy as np - -from mindnlp.transformers import ViTMAEConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - -if is_mindspore_available(): - import mindspore - from mindspore import Tensor - from mindnlp.core import nn - - - from mindnlp.transformers import ViTMAEForPreTraining, ViTMAEModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ViTImageProcessor - - -class ViTMAEModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - num_labels=3, - mask_ratio=0.6, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.mask_ratio = mask_ratio - self.scope = scope - - # in ViTMAE, the expected sequence length = (num_patches + 1) * (1 - config.mask_ratio), rounded above - # (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = int(math.ceil((1 - mask_ratio) * (num_patches + 1))) - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return ViTMAEConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - mask_ratio=self.mask_ratio, - decoder_hidden_size=self.hidden_size, - decoder_intermediate_size=self.intermediate_size, - decoder_num_attention_heads=self.num_attention_heads, - decoder_num_hidden_layers=self.num_hidden_layers, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ViTMAEModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_pretraining(self, config, 
pixel_values, labels): - model = ViTMAEForPreTraining(config) - model.set_train(False) - result = model(pixel_values) - num_patches = (self.image_size // self.patch_size) ** 2 - expected_num_channels = self.patch_size**2 * self.num_channels - self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels)) - - # test greyscale images - config.num_channels = 1 - model = ViTMAEForPreTraining(config) - model.set_train(False) - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - expected_num_channels = self.patch_size**2 - self.parent.assertEqual(result.logits.shape, (self.batch_size, num_patches, expected_num_channels)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ViTMAEModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViTMAE does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (ViTMAEForPreTraining, ) if is_mindspore_available() else () - pipeline_model_mapping = {"image-feature-extraction": ViTMAEModel} if is_mindspore_available() else {} - - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = ViTMAEModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37) - - @unittest.skip(reason="ViTMAE does not test") - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="ViTMAE does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_pretraining(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - - # overwrite from common since ViTMAEForPretraining has random masking, we need to fix the noise - # to generate masks during test - def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - # make masks reproducible - np.random.seed(2) - - num_patches = int((pt_model.config.image_size // pt_model.config.patch_size) ** 2) - noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) - pt_noise = Tensor.from_numpy(noise) - - # Add `noise` argument. 
- # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument - pt_inputs_dict["noise"] = pt_noise - - super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) - - def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.set_train(False) - # make random mask reproducible - mindspore.set_seed(2) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - out_2 = outputs[0].asnumpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, from_pt = True) - # make random mask reproducible - mindspore.set_seed(2) - after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # Make sure we don't have nans - out_1 = after_outputs[0].asnumpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - @unittest.skip( - reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""" - ) - def test_determinism(self): - pass - - @unittest.skip( - reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""" - ) - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""" - ) - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. 
See test_save_load""") - def test_model_outputs_equivalence(self): - pass - - @unittest.skip(reason="ViTMAE returns a random mask + ids_restore in each forward pass") - def test_batching_equivalence(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "google/vit-base-patch16-224" - model = ViTMAEModel.from_pretrained(model_name, from_pt = True) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ViTMAEModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ViTImageProcessor.from_pretrained("facebook/vit-mae-base", from_pt = True) if is_vision_available() else None - - @slow - def test_inference_for_pretraining(self): - # make random mask reproducible across the PT and TF model - np.random.seed(2) - - model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base", from_pt = True) - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - # prepare a noise vector that will be also used for testing the TF model - # (this way we can ensure that the PT and TF models operate on the same inputs) - vit_mae_config = ViTMAEConfig() - num_patches = int((vit_mae_config.image_size // vit_mae_config.patch_size) ** 2) - noise = np.random.uniform(size=(1, num_patches)) - - # forward pass - outputs = model(**inputs, noise=Tensor.from_numpy(noise)) - - # verify the logits - expected_shape = (1, 196, 768) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]] - ) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) diff --git a/tests/transformers/models/vit_msn/__init__.py b/tests/transformers/models/vit_msn/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vit_msn/test_modeling_vit_msn.py b/tests/transformers/models/vit_msn/test_modeling_vit_msn.py deleted file mode 100644 index 832623f6f..000000000 --- a/tests/transformers/models/vit_msn/test_modeling_vit_msn.py +++ /dev/null @@ -1,226 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
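# A minimal sketch of the shape arithmetic behind the (1, 196, 768) expectation in the
# ViTMAE pretraining integration test above, assuming the stock ViTMAEConfig defaults
# (image_size=224, patch_size=16, num_channels=3); only the resulting (1, 196, 768)
# shape is asserted by the test itself, the intermediate numbers are illustrative.
from mindnlp.transformers import ViTMAEConfig

cfg = ViTMAEConfig()
num_patches = (cfg.image_size // cfg.patch_size) ** 2        # (224 // 16) ** 2 = 196
pixels_per_patch = cfg.patch_size ** 2 * cfg.num_channels    # 16 * 16 * 3 = 768
assert (1, num_patches, pixels_per_patch) == (1, 196, 768)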
-"""Testing suite for the Mindspore ViTMSN model.""" - -import unittest -import numpy as np - -from mindnlp.transformers import ViTMSNConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import nn, ops - - from mindnlp.transformers import ViTMSNForImageClassification, ViTMSNModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import ViTImageProcessor - - -class ViTMSNModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - scope=None, - attn_implementation="eager", - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.attn_implementation = attn_implementation - - # in ViT MSN, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return ViTMSNConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - attn_implementation=self.attn_implementation, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = ViTMSNModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_image_classification(self, config, pixel_values, labels): - config.num_labels = self.type_sequence_label_size - model = ViTMSNForImageClassification(config) - model.set_train(False) - result = model(pixel_values, labels=labels) - print("Pixel and labels shape: {pixel_values.shape}, {labels.shape}") 
- print("Labels: {labels}") - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - # test greyscale images - config.num_channels = 1 - model = ViTMSNForImageClassification(config) - model.set_train(False) - - pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size]) - result = model(pixel_values) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class ViTMSNModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as ViTMSN does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (ViTMSNForImageClassification,) if is_mindspore_available() else () - - test_pruning = False - test_torchscript = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = ViTMSNModelTester(self) - self.config_tester = ConfigTester(self, config_class=ViTMSNConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="ViTMSN does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_image_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "facebook/vit-msn-small" - model = ViTMSNModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class ViTMSNModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return ViTImageProcessor.from_pretrained("facebook/vit-msn-small") if is_vision_available() else None - - @slow - @unittest.skip(reason="The random number generation of pytorch and mindspore is inconsistent") - def test_inference_image_classification_head(self): - ms.set_seed(2) - model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-small") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape =(1, 1000) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = ms.tensor([0.5588, 0.6853, -0.5929]) - print(outputs.logits[0, :3].asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3].asnumpy(), expected_slice.asnumpy(), 
atol=1e-3)) \ No newline at end of file diff --git a/tests/transformers/models/vitdet/__init__.py b/tests/transformers/models/vitdet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vitdet/test_modeling_vitdet.py b/tests/transformers/models/vitdet/test_modeling_vitdet.py deleted file mode 100644 index 2890317d4..000000000 --- a/tests/transformers/models/vitdet/test_modeling_vitdet.py +++ /dev/null @@ -1,302 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore ViTDet model.""" - -import unittest - -from mindnlp.transformers import VitDetConfig -from mindnlp.utils.testing_utils import is_flaky, require_mindspore, is_mindspore_available - -from ...test_backbone_common import BackboneTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import VitDetBackbone, VitDetModel - - -class VitDetModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - - self.num_patches_one_direction = self.image_size // self.patch_size - self.seq_length = (self.image_size // self.patch_size) ** 2 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return VitDetConfig( - image_size=self.image_size, - pretrain_image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - 
num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = VitDetModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.hidden_size, self.num_patches_one_direction, self.num_patches_one_direction), - ) - - def create_and_check_backbone(self, config, pixel_values, labels): - model = VitDetBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify hidden states - self.parent.assertEqual(len(result.feature_maps), len(config.out_features)) - self.parent.assertListEqual( - list(result.feature_maps[0].shape), - [self.batch_size, self.hidden_size, self.num_patches_one_direction, self.num_patches_one_direction], - ) - - # verify channels - self.parent.assertEqual(len(model.channels), len(config.out_features)) - self.parent.assertListEqual(model.channels, [config.hidden_size]) - - # verify backbone works with out_features=None - config.out_features = None - model = VitDetBackbone(config=config) - model.set_train(False) - result = model(pixel_values) - - # verify feature maps - self.parent.assertEqual(len(result.feature_maps), 1) - self.parent.assertListEqual( - list(result.feature_maps[0].shape), - [self.batch_size, self.hidden_size, self.num_patches_one_direction, self.num_patches_one_direction], - ) - - # verify channels - self.parent.assertEqual(len(model.channels), 1) - self.parent.assertListEqual(model.channels, [config.hidden_size]) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class VitDetModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as VitDet does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (VitDetModel, VitDetBackbone) if is_mindspore_available() else () - pipeline_model_mapping = {"feature-extraction": VitDetModel} if is_mindspore_available() else {} - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = VitDetModelTester(self) - self.config_tester = ConfigTester(self, config_class=VitDetConfig, has_text_modality=False, hidden_size=37) - - @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.") - def test_initialization(self): - super().test_initialization() - - # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") - def test_cpu_offload(self): - super().test_cpu_offload() - - # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") - def test_disk_offload_bin(self): - super().test_disk_offload() - - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") - def test_disk_offload_safetensors(self): - super().test_disk_offload() - - # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.") - def test_model_parallelism(self): - super().test_model_parallelism() - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="VitDet does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_backbone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_backbone(*config_and_inputs) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - - expected_num_stages = self.model_tester.num_hidden_layers - self.assertEqual(len(hidden_states), expected_num_stages + 1) - - # VitDet's feature maps are of shape (batch_size, num_channels, height, width) - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [ - self.model_tester.num_patches_one_direction, - self.model_tester.num_patches_one_direction, - ], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # overwrite since VitDet only supports retraining gradients of hidden states - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - 
config.output_hidden_states = True - config.output_attentions = self.has_attentions - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - inputs = self._prepare_for_class(inputs_dict, model_class) - - outputs = model(**inputs) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - #hidden_states.retain_grad() - - #output.flatten()[0].backward(retain_graph=True) - - #self.assertIsNotNone(hidden_states.grad) - - @unittest.skip(reason="VitDet does not support feedforward chunking") - def test_feed_forward_chunking(self): - pass - - @unittest.skip(reason="VitDet does not have standalone checkpoints since it used as backbone in other models") - def test_model_from_pretrained(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Dense)) - -@require_mindspore -class VitDetBackboneTest(unittest.TestCase, BackboneTesterMixin): - all_model_classes = (VitDetBackbone,) if is_mindspore_available() else () - config_class = VitDetConfig - - has_attentions = False - - def setUp(self): - self.model_tester = VitDetModelTester(self) diff --git a/tests/transformers/models/vitmatte/__init__.py b/tests/transformers/models/vitmatte/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vitmatte/test_image_processing_vitmatte.py b/tests/transformers/models/vitmatte/test_image_processing_vitmatte.py deleted file mode 100644 index 231485a96..000000000 --- a/tests/transformers/models/vitmatte/test_image_processing_vitmatte.py +++ /dev/null @@ -1,200 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
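# A minimal sketch of the spatial-shape bookkeeping that the VitDet checks above rely on,
# using the tester defaults shown earlier (batch_size=13, image_size=30, patch_size=2,
# hidden_size=32); the values below only restate that arithmetic, they are not an
# additional assertion from the original suite.
batch_size, image_size, patch_size, hidden_size = 13, 30, 2, 32
num_patches_one_direction = image_size // patch_size          # 30 // 2 = 15
expected_last_hidden_state_shape = (
    batch_size, hidden_size, num_patches_one_direction, num_patches_one_direction
)
assert expected_last_hidden_state_shape == (13, 32, 15, 15)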
- - -import unittest - -import numpy as np - -from mindnlp.utils import is_vision_available -from mindnlp.utils.testing_utils import require_mindspore, require_vision, is_mindspore_available - - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import VitMatteImageProcessor - - -class VitMatteImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - image_size=18, - min_resolution=30, - max_resolution=400, - do_rescale=True, - rescale_factor=0.5, - do_pad=True, - size_divisibility=10, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - ): - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_pad = do_pad - self.size_divisibility = size_divisibility - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_pad": self.do_pad, - "size_divisibility": self.size_divisibility, - } - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class VitMatteImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = VitMatteImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = VitMatteImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "do_pad")) - self.assertTrue(hasattr(image_processing, "size_divisibility")) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input (image processor does not support batched inputs) - image = image_inputs[0] - trimap = np.random.randint(0, 3, size=image.shape[:2]) - encoded_images = image_processing(images=image, trimaps=trimap, return_tensors="ms").pixel_values - - # Verify that width and height can be divided by size_divisibility - 
self.assertTrue(encoded_images.shape[-1] % self.image_processor_tester.size_divisibility == 0) - self.assertTrue(encoded_images.shape[-2] % self.image_processor_tester.size_divisibility == 0) - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - - for image in image_inputs: - self.assertIsInstance(image, mindspore.Tensor) - - # Test not batched input (image processor does not support batched inputs) - image = image_inputs[0] - trimap = np.random.randint(0, 3, size=image.shape[:2]) - encoded_images = image_processing(images=image, trimaps=trimap, return_tensors="ms").pixel_values - - # Verify that width and height can be divided by size_divisibility - self.assertTrue(encoded_images.shape[-1] % self.image_processor_tester.size_divisibility == 0) - self.assertTrue(encoded_images.shape[-2] % self.image_processor_tester.size_divisibility == 0) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input (image processor does not support batched inputs) - image = image_inputs[0] - trimap = np.random.randint(0, 3, size=image.size[::-1]) - encoded_images = image_processing(images=image, trimaps=trimap, return_tensors="ms").pixel_values - - # Verify that width and height can be divided by size_divisibility - self.assertTrue(encoded_images.shape[-1] % self.image_processor_tester.size_divisibility == 0) - self.assertTrue(encoded_images.shape[-2] % self.image_processor_tester.size_divisibility == 0) - - def test_call_numpy_4_channels(self): - # Test that can process images which have an arbitrary number of channels - # Initialize image_processing - image_processor = self.image_processing_class(**self.image_processor_dict) - - # create random numpy tensors - self.image_processor_tester.num_channels = 4 - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - - # Test not batched input (image processor does not support batched inputs) - image = image_inputs[0] - trimap = np.random.randint(0, 3, size=image.shape[:2]) - encoded_images = image_processor( - images=image, - trimaps=trimap, - input_data_format="channels_first", - image_mean=0, - image_std=1, - return_tensors="ms", - ).pixel_values - - # Verify that width and height can be divided by size_divisibility - self.assertTrue(encoded_images.shape[-1] % self.image_processor_tester.size_divisibility == 0) - self.assertTrue(encoded_images.shape[-2] % self.image_processor_tester.size_divisibility == 0) - - def test_padding(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - image = np.random.randn(3, 249, 491) - images = image_processing.pad_image(image) - assert images.shape == (3, 256, 512) - - image = np.random.randn(3, 249, 512) - images = image_processing.pad_image(image) - assert images.shape == (3, 256, 512) diff --git a/tests/transformers/models/vitmatte/test_modeling_vitmatte.py b/tests/transformers/models/vitmatte/test_modeling_vitmatte.py deleted file mode 100644 index c14b089d4..000000000 --- 
a/tests/transformers/models/vitmatte/test_modeling_vitmatte.py +++ /dev/null @@ -1,289 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the PyTorch VitMatte model.""" - -import unittest - -from huggingface_hub import hf_hub_download - -from mindnlp.core import nn, ops -from mindnlp.transformers import VitMatteConfig -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - slow, -) -from mindnlp.utils import is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - - from mindnlp.transformers import VitDetConfig, VitMatteForImageMatting - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import VitMatteImageProcessor - - -class VitMatteModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=32, - patch_size=16, - num_channels=4, - is_training=True, - use_labels=False, - hidden_size=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_act="gelu", - type_sequence_label_size=10, - initializer_range=0.02, - scope=None, - out_features=["stage1"], - fusion_hidden_sizes=[128, 64, 32, 16], - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.scope = scope - self.out_features = out_features - self.fusion_hidden_sizes = fusion_hidden_sizes - - self.seq_length = (self.image_size // self.patch_size) ** 2 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) - - labels = None - if self.use_labels: - raise NotImplementedError("Training is not yet supported") - - config = self.get_config() - - return config, pixel_values, labels - - def get_backbone_config(self): - return VitDetConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_size=self.hidden_size, - is_training=self.is_training, - hidden_act=self.hidden_act, - out_features=self.out_features, - ) - - def get_config(self): - return VitMatteConfig( - backbone_config=self.get_backbone_config(), - backbone=None, - hidden_size=self.hidden_size, - fusion_hidden_sizes=self.fusion_hidden_sizes, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = 
VitMatteForImageMatting(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.alphas.shape, (self.batch_size, 1, self.image_size, self.image_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class VitMatteModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as VitMatte does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (VitMatteForImageMatting,) if is_mindspore_available() else () - pipeline_model_mapping = {} - - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = VitMatteModelTester(self) - self.config_tester = ConfigTester( - self, - config_class=VitMatteConfig, - has_text_modality=False, - hidden_size=37, - common_properties=["hidden_size"], - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="VitMatte does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Training is not yet supported") - def test_training(self): - pass - - @unittest.skip(reason="Training is not yet supported") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="ViTMatte does not support input and output embeddings") - def test_model_get_set_embeddings(self): - pass - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "hustvl/vitmatte-small-composition-1k" - model = VitMatteForImageMatting.from_pretrained(model_name) - self.assertIsNotNone(model) - - @unittest.skip(reason="ViTMatte does not support retaining gradient on attention logits") - def test_retain_grad_hidden_states_attentions(self): - pass - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [2, 2], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del 
inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - print("Hello we're here") - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_backbone_selection(self): - def _validate_backbone_init(): - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - if model.__class__.__name__ == "VitMatteForImageMatting": - # Confirm out_indices propogated to backbone - self.assertEqual(len(model.backbone.out_indices), 2) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_pretrained_backbone = True - config.backbone_config = None - config.backbone_kwargs = {"out_indices": [-2, -1]} - # Force load_backbone path - config.is_hybrid = False - - # Load a timm backbone - # config.backbone = "resnet18" - # config.use_timm_backbone = True - # _validate_backbone_init() - - # Load a HF backbone - config.backbone = "facebook/dinov2-small" - config.use_timm_backbone = False - _validate_backbone_init() - - -@require_mindspore -class VitMatteModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k") - model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k") - - filepath = hf_hub_download( - repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset" - ) - image = Image.open(filepath).convert("RGB") - filepath = hf_hub_download( - repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset" - ) - trimap = Image.open(filepath).convert("L") - - # prepare image + trimap for the model - inputs = processor(images=image, trimaps=trimap, return_tensors="ms") - - alphas = model(**inputs).alphas - - expected_shape = (1, 1, 640, 960) - self.assertEqual(alphas.shape, expected_shape) - - expected_slice = mindspore.tensor( - [[0.9977, 0.9987, 0.9990], [0.9980, 0.9998, 0.9998], [0.9983, 0.9998, 0.9998]] - ) - self.assertTrue(ops.allclose(alphas[0, 0, :3, :3], expected_slice, atol=1e-4)) diff --git a/tests/transformers/models/vits/__init__.py b/tests/transformers/models/vits/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vits/test_modeling_vits.py b/tests/transformers/models/vits/test_modeling_vits.py deleted file mode 100644 index 6cb1b1cdc..000000000 --- a/tests/transformers/models/vits/test_modeling_vits.py +++ /dev/null @@ -1,428 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the PyTorch VITS model.""" - -import copy -import os -import tempfile -import unittest -from typing import Dict, List, Tuple - -import numpy as np - -from mindnlp.transformers import PretrainedConfig, VitsConfig -from mindnlp.utils.testing_utils import ( - is_flaky, - require_mindspore, - is_mindspore_available, - slow, -) -from mindnlp.engine import set_seed - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - global_rng, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import VitsModel, VitsTokenizer - - -CONFIG_NAME = "config.json" -GENERATION_CONFIG_NAME = "generation_config.json" - - -def _config_zero_init(config): - configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): - no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) - setattr(configs_no_init, key, no_init_subconfig) - return configs_no_init - - -@require_mindspore -class VitsModelTester: - def __init__( - self, - parent, - batch_size=2, - seq_length=7, - is_training=False, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=2, - intermediate_size=64, - flow_size=16, - vocab_size=38, - spectrogram_bins=8, - duration_predictor_num_flows=2, - duration_predictor_filter_channels=16, - prior_encoder_num_flows=2, - upsample_initial_channel=16, - upsample_rates=[8, 2], - upsample_kernel_sizes=[16, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.flow_size = flow_size - self.vocab_size = vocab_size - self.spectrogram_bins = spectrogram_bins - self.duration_predictor_num_flows = duration_predictor_num_flows - self.duration_predictor_filter_channels = duration_predictor_filter_channels - self.prior_encoder_num_flows = prior_encoder_num_flows - self.upsample_initial_channel = upsample_initial_channel - self.upsample_rates = upsample_rates - self.upsample_kernel_sizes = upsample_kernel_sizes - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_config(self): - return VitsConfig( - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - ffn_dim=self.intermediate_size, - flow_size=self.flow_size, - vocab_size=self.vocab_size, - spectrogram_bins=self.spectrogram_bins, - 
duration_predictor_num_flows=self.duration_predictor_num_flows, - prior_encoder_num_flows=self.prior_encoder_num_flows, - duration_predictor_filter_channels=self.duration_predictor_filter_channels, - posterior_encoder_num_wavenet_layers=self.num_hidden_layers, - upsample_initial_channel=self.upsample_initial_channel, - upsample_rates=self.upsample_rates, - upsample_kernel_sizes=self.upsample_kernel_sizes, - resblock_kernel_sizes=self.resblock_kernel_sizes, - resblock_dilation_sizes=self.resblock_dilation_sizes, - ) - - def create_and_check_model_forward(self, config, inputs_dict): - model = VitsModel(config=config).set_train(False) - - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict["attention_mask"] - - result = model(input_ids, attention_mask=attention_mask) - self.parent.assertEqual((self.batch_size, 624), result.waveform.shape) - - -@require_mindspore -class VitsModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = () if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": VitsModel, "text-to-audio": VitsModel} if is_mindspore_available() else {} - ) - is_encoder_decoder = False - test_pruning = False - test_headmasking = False - test_resize_embeddings = False - test_head_masking = False - test_torchscript = False - has_attentions = False - - input_name = "input_ids" - - def setUp(self): - self.model_tester = VitsModelTester(self) - self.config_tester = ConfigTester(self, config_class=VitsConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Need to fix this after #26538") - def test_model_forward(self): - set_seed(12345) - global_rng.seed(12345) - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - @unittest.skip( - reason="require_torch_multi_gpu" - ) - # @require_torch_multi_gpu - # override to force all elements of the batch to have the same sequence length across GPUs - def test_multi_gpu_data_parallel_forward(self): - pass - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # config.use_stochastic_duration_prediction = False - - # # move input tensors to cuda:O - # for key, value in inputs_dict.items(): - # if torch.is_tensor(value): - # # make all elements of the batch the same -> ensures the output seq lengths are the same for DP - # value[1:] = value[0] - # inputs_dict[key] = value.to(0) - - # for model_class in self.all_model_classes: - # model = model_class(config=config) - # model.to(0) - # model.eval() - - # # Wrap model in nn.DataParallel - # model = torch.nn.DataParallel(model) - # set_seed(555) - # with torch.no_grad(): - # _ = model(**self._prepare_for_class(inputs_dict, model_class)).waveform - - @unittest.skip(reason="VITS is not deterministic") - def test_determinism(self): - pass - - @unittest.skip(reason="VITS is not deterministic") - def test_batching_equivalence(self): - pass - - @is_flaky( - max_attempts=3, - description="Weight initialisation for the VITS conv layers sometimes exceeds the kaiming normal range", - ) - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - uniform_init_parms = [ - "emb_rel_k", - "emb_rel_v", - "conv_1", - "conv_2", - "conv_pre", - "conv_post", - "conv_proj", - "conv_dds", - "project", - "wavenet.in_layers", - "wavenet.res_skip_layers", - "upsampler", - "resblocks", - ] - - configs_no_init = _config_zero_init(config) - 
for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - @unittest.skip(reason="VITS has no inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="VITS has no input embeddings") - def test_model_get_set_embeddings(self): - pass - - # override since the model is not deterministic, so we need to set the seed for each forward pass - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - # with mindspore.no_grad(): - set_seed(0) - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - set_seed(0) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - np.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." 
- ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - if self.has_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) - - # override since the model is not deterministic, so we need to set the seed for each forward pass - def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_save_load(out1, out2): - # make sure we don't have nans - out_2 = out2.numpy() - out_2[np.isnan(out_2)] = 0 - - out_1 = out1.numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - # with mindspore.no_grad(): - set_seed(0) - first = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # the config file (and the generation config file, if it can generate) should be saved - self.assertTrue(os.path.exists(os.path.join(tmpdirname, CONFIG_NAME))) - self.assertEqual( - model.can_generate(), os.path.exists(os.path.join(tmpdirname, GENERATION_CONFIG_NAME)) - ) - - model = model_class.from_pretrained(tmpdirname, from_pt=True) - # with mindspore.no_grad(): - set_seed(0) - second = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - if isinstance(first, tuple) and isinstance(second, tuple): - for tensor1, tensor2 in zip(first, second): - check_save_load(tensor1, tensor2) - else: - check_save_load(first, second) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, 
"weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - - -@require_mindspore -@slow -@unittest.skip(reason="Need to fix") -class VitsModelIntegrationTests(unittest.TestCase): - def test_forward(self): - # GPU gives different results than CPU - - model = VitsModel.from_pretrained("facebook/mms-tts-eng", from_pt=True) - - tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng", from_pt=True) - - set_seed(555) # make deterministic - - input_text = "Mister quilter is the apostle of the middle classes and we are glad to welcome his gospel!" - input_ids = tokenizer(input_text, return_tensors="ms").input_ids - - # with mindspore.no_grad(): - outputs = model(input_ids) - - self.assertEqual(outputs.waveform.shape, (1, 87040)) - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - -0.0042, 0.0176, 0.0354, 0.0504, 0.0621, 0.0777, 0.0980, 0.1224, - 0.1475, 0.1679, 0.1817, 0.1832, 0.1713, 0.1542, 0.1384, 0.1256, - 0.1147, 0.1066, 0.1026, 0.0958, 0.0823, 0.0610, 0.0340, 0.0022, - -0.0337, -0.0677, -0.0969, -0.1178, -0.1311, -0.1363 - ] - ) - # fmt: on - self.assertTrue(np.allclose(outputs.waveform[0, 10000:10030].cpu(), EXPECTED_LOGITS, atol=1e-4)) diff --git a/tests/transformers/models/vivit/__init__.py b/tests/transformers/models/vivit/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/vivit/test_image_processing_vit.py b/tests/transformers/models/vivit/test_image_processing_vit.py deleted file mode 100644 index fb6166ad7..000000000 --- a/tests/transformers/models/vivit/test_image_processing_vit.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest - -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import ImageProcessingTestMixin, prepare_video_inputs - - -if is_mindspore_available(): - import mindspore - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import VivitImageProcessor - - -class VivitImageProcessingTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - num_frames=10, - image_size=18, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - crop_size=None, - ): - super().__init__() - size = size if size is not None else {"shortest_edge": 18} - crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} - - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.num_frames = num_frames - self.image_size = image_size - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.crop_size = crop_size - - def prepare_image_processor_dict(self): - return { - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_normalize": self.do_normalize, - "do_resize": self.do_resize, - "size": self.size, - "crop_size": self.crop_size, - } - - def expected_output_image_shape(self, images): - return self.num_frames, self.num_channels, self.crop_size["height"], self.crop_size["width"] - - def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_video_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - num_frames=self.num_frames, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class VivitImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = VivitImageProcessor if is_vision_available() else None - - def setUp(self): - super().setUp() - self.image_processor_tester = VivitImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "do_center_crop")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 18}) - self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) - - image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) - self.assertEqual(image_processor.size, {"shortest_edge": 42}) - self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - - def 
test_rescale(self): - # ViVit optionally rescales between -1 and 1 instead of the usual 0 and 1 - image = np.arange(0, 256, 1, dtype=np.uint8).reshape(1, 8, 32) - - image_processor = self.image_processing_class(**self.image_processor_dict) - - rescaled_image = image_processor.rescale(image, scale=1 / 127.5) - expected_image = (image * (1 / 127.5)).astype(np.float32) - 1 - self.assertTrue(np.allclose(rescaled_image, expected_image)) - - rescaled_image = image_processor.rescale(image, scale=1 / 255, offset=False) - expected_image = (image / 255.0).astype(np.float32) - self.assertTrue(np.allclose(rescaled_image, expected_image)) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL videos - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], Image.Image) - - # Test not batched input - encoded_videos = image_processing(video_inputs[0], return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing(video_inputs, return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], np.ndarray) - - # Test not batched input - encoded_videos = image_processing(video_inputs[0], return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing(video_inputs, return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) - - def test_call_numpy_4_channels(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - self.image_processor_tester.num_channels = 4 - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], np.ndarray) - - # Test not batched input - encoded_videos = image_processing( - video_inputs[0], return_tensors="ms", image_mean=0, image_std=1, input_data_format="channels_first" - ).pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = 
image_processing( - video_inputs, return_tensors="ms", image_mean=0, image_std=1, input_data_format="channels_first" - ).pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) - self.image_processor_tester.num_channels = 3 - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, torchify=True) - for video in video_inputs: - self.assertIsInstance(video, list) - self.assertIsInstance(video[0], mindspore.Tensor) - - # Test not batched input - encoded_videos = image_processing(video_inputs[0], return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape([encoded_videos[0]]) - self.assertEqual(tuple(encoded_videos.shape), (1, *expected_output_video_shape)) - - # Test batched - encoded_videos = image_processing(video_inputs, return_tensors="ms").pixel_values - expected_output_video_shape = self.image_processor_tester.expected_output_image_shape(encoded_videos) - self.assertEqual( - tuple(encoded_videos.shape), (self.image_processor_tester.batch_size, *expected_output_video_shape) - ) diff --git a/tests/transformers/models/vivit/test_modeling_vivit.py b/tests/transformers/models/vivit/test_modeling_vivit.py deleted file mode 100644 index 18d142928..000000000 --- a/tests/transformers/models/vivit/test_modeling_vivit.py +++ /dev/null @@ -1,361 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the Mindspore ViViT model.""" - -import copy -import inspect -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download - -from mindnlp.transformers import VivitConfig -from mindnlp.transformers.models.auto import get_values -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available -from mindnlp.transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn - from mindnlp.transformers import VivitForVideoClassification, VivitModel - -if is_vision_available(): - from mindnlp.transformers import VivitImageProcessor - - -class VivitModelTester: - def __init__( - self, - parent, - batch_size=2, - is_training=True, - use_labels=True, - num_labels=10, - image_size=10, - num_frames=8, # decreased, because default 32 takes too much RAM at inference - tubelet_size=[2, 4, 4], - num_channels=3, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu_fast", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - layer_norm_eps=1e-06, - qkv_bias=True, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_labels = use_labels - self.num_labels = num_labels - self.image_size = image_size - self.num_frames = num_frames - self.tubelet_size = tubelet_size - self.num_channels = num_channels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.qkv_bias = qkv_bias - self.scope = scope - - self.seq_length = ( - (self.image_size // self.tubelet_size[2]) - * (self.image_size // self.tubelet_size[1]) - * (self.num_frames // self.tubelet_size[0]) - ) + 1 # CLS token - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size] - ) - - labels = None - if self.use_labels: - labels = ids_tensor([self.batch_size], self.num_labels) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - config = VivitConfig( - num_frames=self.num_frames, - image_size=self.image_size, - tubelet_size=self.tubelet_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - initializer_range=self.initializer_range, - layer_norm_eps=self.layer_norm_eps, - qkv_bias=self.qkv_bias, - ) - config.num_labels = self.num_labels - return config - - def create_and_check_model(self, config, pixel_values, labels): - model = VivitModel(config=config) - model.eval() - result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, 
(self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_for_video_classification(self, config, pixel_values, labels): - model = VivitForVideoClassification(config) - model.eval() - - result = model(pixel_values) - - # verify the logits shape - expected_shape = (self.batch_size, self.num_labels) - self.parent.assertEqual(result.logits.shape, expected_shape) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class VivitModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as Vivit does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (VivitModel, VivitForVideoClassification) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"feature-extraction": VivitModel, "video-classification": VivitForVideoClassification} - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_torchscript = False - - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = VivitModelTester(self) - self.config_tester = ConfigTester(self, config_class=VivitConfig, has_text_modality=False, hidden_size=37) - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - if return_labels: - if model_class in get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING): - inputs_dict["labels"] = mindspore.ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - return inputs_dict - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="Vivit does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values", "head_mask"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_video_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_video_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "google/vivit-b-16x2-kinetics400" - model = VivitModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - for model_class in self.all_model_classes: - seq_len = self.model_tester.seq_length - - 
inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.hidden_states - expected_num_layers = self.model_tester.num_hidden_layers + 1 - self.assertEqual(len(hidden_states), expected_num_layers) - - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - -# We will verify our results on a video of eating spaghetti -# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227] -def prepare_video(): - file = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti_32_frames.npy", repo_type="dataset" - ) - video = np.load(file) - return list(video) - - -@require_mindspore -@require_vision -class VivitModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return VivitImageProcessor() if is_vision_available() else None - - @slow - def test_inference_for_video_classification(self): - model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400") - - image_processor = self.default_image_processor - video = prepare_video() - inputs = image_processor(video, return_tensors="ms") - - # forward pass - outputs = model(**inputs) - - # verify the logits - expected_shape = (1, 400) - self.assertEqual(outputs.logits.shape, expected_shape) - - # taken from original model - expected_slice = mindspore.tensor([-0.9498, 2.7971, -1.4049, 0.1024, -1.8353]).asnumpy() - model_output = 
outputs.logits[0, :5].asnumpy() - self.assertTrue(np.allclose(model_output, expected_slice, rtol=1e-2, atol=1e-4)) - - @slow - def test_inference_interpolate_pos_encoding(self): - # Vivit models have an `interpolate_pos_encoding` argument in their forward method, - # allowing to interpolate the pre-trained position embeddings in order to use - # the model on higher resolutions. The DINO model by Facebook AI leverages this - # to visualize self-attention on higher resolution images. - model = VivitModel.from_pretrained("google/vivit-b-16x2") - - image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2") - video = prepare_video() - inputs = image_processor( - video, size={"shortest_edge": 480}, crop_size={"height": 480, "width": 480}, return_tensors="ms" - ) - pixel_values = inputs.pixel_values - - # forward pass - outputs = model(pixel_values, interpolate_pos_encoding=True) - - # verify the logits shape - expected_shape = (1, 3137, 768) - self.assertEqual(outputs.last_hidden_state.shape, expected_shape) diff --git a/tests/transformers/models/wav2vec2/__init__.py b/tests/transformers/models/wav2vec2/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/wav2vec2/test_feature_extraction_wav2vec2.py b/tests/transformers/models/wav2vec2/test_feature_extraction_wav2vec2.py deleted file mode 100644 index e18dc2c50..000000000 --- a/tests/transformers/models/wav2vec2/test_feature_extraction_wav2vec2.py +++ /dev/null @@ -1,233 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import itertools -import random -import unittest - -import numpy as np - -from mindnlp.transformers import Wav2Vec2Config, Wav2Vec2FeatureExtractor -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin - - -global_rng = random.Random() - - -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -class Wav2Vec2FeatureExtractionTester(unittest.TestCase): - def __init__( - self, - parent, - batch_size=7, - min_seq_length=400, - max_seq_length=2000, - feature_size=1, - padding_value=0.0, - sampling_rate=16000, - return_attention_mask=True, - do_normalize=True, - ): - self.parent = parent - self.batch_size = batch_size - self.min_seq_length = min_seq_length - self.max_seq_length = max_seq_length - self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) - self.feature_size = feature_size - self.padding_value = padding_value - self.sampling_rate = sampling_rate - self.return_attention_mask = return_attention_mask - self.do_normalize = do_normalize - - def prepare_feat_extract_dict(self): - return { - "feature_size": self.feature_size, - "padding_value": self.padding_value, - "sampling_rate": self.sampling_rate, - "return_attention_mask": self.return_attention_mask, - "do_normalize": self.do_normalize, - } - - def prepare_inputs_for_common(self, equal_length=False, numpify=False): - def _flatten(list_of_lists): - return list(itertools.chain(*list_of_lists)) - - if equal_length: - speech_inputs = floats_list((self.batch_size, self.max_seq_length)) - else: - # make sure that inputs increase in size - speech_inputs = [ - _flatten(floats_list((x, self.feature_size))) - for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) - ] - - if numpify: - speech_inputs = [np.asarray(x) for x in speech_inputs] - - return speech_inputs - - -class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Wav2Vec2FeatureExtractor - - def setUp(self): - self.feat_extract_tester = Wav2Vec2FeatureExtractionTester(self) - - def _check_zero_mean_unit_variance(self, input_vector): - self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3)) - self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3)) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test not batched input - encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values - encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values - encoded_sequences_2 = feat_extract(np_speech_inputs, 
return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. - speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values - encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_zero_mean_unit_variance_normalization_np(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - paddings = ["longest", "max_length", "do_not_pad"] - max_lengths = [None, 1600, None] - for max_length, padding in zip(max_lengths, paddings): - processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np") - input_values = processed.input_values - - self._check_zero_mean_unit_variance(input_values[0][:800]) - self.assertTrue(input_values[0][800:].sum() < 1e-6) - self._check_zero_mean_unit_variance(input_values[1][:1000]) - self.assertTrue(input_values[0][1000:].sum() < 1e-6) - self._check_zero_mean_unit_variance(input_values[2][:1200]) - - def test_zero_mean_unit_variance_normalization(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - lengths = range(800, 1400, 200) - speech_inputs = [floats_list((1, x))[0] for x in lengths] - - paddings = ["longest", "max_length", "do_not_pad"] - max_lengths = [None, 1600, None] - - for max_length, padding in zip(max_lengths, paddings): - processed = feat_extract(speech_inputs, max_length=max_length, padding=padding) - input_values = processed.input_values - - self._check_zero_mean_unit_variance(input_values[0][:800]) - self._check_zero_mean_unit_variance(input_values[1][:1000]) - self._check_zero_mean_unit_variance(input_values[2][:1200]) - - def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - processed = feat_extract( - speech_inputs, truncation=True, max_length=1000, padding="max_length", return_tensors="np" - ) - input_values = processed.input_values - - self._check_zero_mean_unit_variance(input_values[0, :800]) - self._check_zero_mean_unit_variance(input_values[1]) - self._check_zero_mean_unit_variance(input_values[2]) - - def test_zero_mean_unit_variance_normalization_trunc_np_longest(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - processed = feat_extract( - speech_inputs, truncation=True, max_length=1000, padding="longest", return_tensors="np" - ) - input_values = processed.input_values - - self._check_zero_mean_unit_variance(input_values[0, :800]) - self._check_zero_mean_unit_variance(input_values[1, :1000]) - self._check_zero_mean_unit_variance(input_values[2]) - - # make sure that if max_length < longest -> then pad to max_length - self.assertTrue(input_values.shape == (3, 1000)) - - speech_inputs = [floats_list((1, x))[0] for x in 
range(800, 1400, 200)] - processed = feat_extract( - speech_inputs, truncation=True, max_length=2000, padding="longest", return_tensors="np" - ) - input_values = processed.input_values - - self._check_zero_mean_unit_variance(input_values[0, :800]) - self._check_zero_mean_unit_variance(input_values[1, :1000]) - self._check_zero_mean_unit_variance(input_values[2]) - - # make sure that if max_length > longest -> then pad to longest - self.assertTrue(input_values.shape == (3, 1200)) - - @require_mindspore - def test_double_precision_pad(self): - import mindspore - - feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) - np_speech_inputs = np.random.rand(100).astype(np.float64) - py_speech_inputs = np_speech_inputs.tolist() - - for inputs in [py_speech_inputs, np_speech_inputs]: - np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np") - self.assertTrue(np_processed.input_values.dtype == np.float32) - pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="ms") - self.assertTrue(pt_processed.input_values.dtype == mindspore.float32) - - @slow - @require_mindspore - def test_pretrained_checkpoints_are_set_correctly(self): - # this test makes sure that models that are using - # group norm don't have their feature extractor return the - # attention_mask - model_id = "facebook/wav2vec2-base-960h" - config = Wav2Vec2Config.from_pretrained(model_id) - feat_extract = Wav2Vec2FeatureExtractor.from_pretrained(model_id) - - # only "layer" feature extraction norm should make use of - # attention_mask - self.assertEqual(feat_extract.return_attention_mask, config.feat_extract_norm == "layer") \ No newline at end of file diff --git a/tests/transformers/models/wav2vec2/test_modeling_wav2vec2.py b/tests/transformers/models/wav2vec2/test_modeling_wav2vec2.py deleted file mode 100644 index 172f09ec7..000000000 --- a/tests/transformers/models/wav2vec2/test_modeling_wav2vec2.py +++ /dev/null @@ -1,1739 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Wav2Vec2 model.""" - -import gc -import math -import multiprocessing -import os -import pickle -import tempfile -import traceback -import unittest - -import mindspore.dataset.audio -import numpy as np -from datasets import load_dataset -from pytest import mark - -from mindnlp.transformers import Wav2Vec2Config, is_mindspore_available -from mindnlp.utils.testing_utils import ( - CaptureLogger, - is_pyctcdecode_available, - require_pyctcdecode, - require_soundfile, - require_mindspore, - run_test_in_subprocess, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindspore.dataset.audio import Resample - - from mindnlp.core import ops, no_grad, nn - from mindnlp.core.nn import functional as F - from mindnlp.core.serialization import safe_save_file, save - - from mindnlp.transformers import ( - Wav2Vec2FeatureExtractor, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForCTC, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForPreTraining, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForXVector, - Wav2Vec2Model, - Wav2Vec2Processor, - ) - from mindnlp.transformers.models.wav2vec2.modeling_wav2vec2 import ( - WAV2VEC2_ADAPTER_PT_FILE, - WAV2VEC2_ADAPTER_SAFE_FILE, - Wav2Vec2GumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, - ) - -if is_pyctcdecode_available(): - import pyctcdecode.decoder - - from mindnlp.transformers import Wav2Vec2ProcessorWithLM - from mindnlp.transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm - - - -def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None - try: - _ = in_queue.get(timeout=timeout) - - ds = load_dataset( - "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True - ) - sample = next(iter(ds)) - - resample = Resample(48_000, 16_000) - resampled_audio = resample(sample["audio"]["array"]) - - model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") - processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") - - input_values = processor(resampled_audio, return_tensors="ms").input_values - - with no_grad(): - logits = model(input_values).logits - - # use a spawn pool, which should trigger a warning if different than fork - with CaptureLogger(pyctcdecode.decoder.logger) as cl, multiprocessing.get_context("spawn").Pool(1) as pool: - transcription = processor.batch_decode(logits.asnumpy(), pool).text - - unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out) - unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - - # force batch_decode to internally create a spawn pool, which should trigger a warning if different than fork - multiprocessing.set_start_method("spawn", force=True) - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl: - transcription = processor.batch_decode(logits.asnumpy()).text - - unittest.TestCase().assertIn("Falling back to sequential decoding.", cl.out) - unittest.TestCase().assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - 
out_queue.join() - - -class Wav2Vec2ModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - do_stable_layer_norm=False, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return Wav2Vec2Config( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - 
layer_norm_eps=self.layer_norm_eps, - do_stable_layer_norm=self.do_stable_layer_norm, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Wav2Vec2Model(config=config) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Wav2Vec2Model(config=config) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_for_ctc(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 2 * config.hidden_size - model = Wav2Vec2ForCTC(config=config) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Wav2Vec2Model(config=config) - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_model_with_attn_adapter(self, config, input_values, attention_mask): - config.adapter_attn_dim = 16 - model = Wav2Vec2ForCTC(config=config) - - self.parent.assertIsNotNone(model._get_adapters()) - - model.eval() - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.output_seq_length, self.vocab_size)) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Wav2Vec2Model(config=config) - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(ops.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Wav2Vec2ForCTC(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - 
input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Wav2Vec2ForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.eval() - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2ForCTC(config=config) - model.train() - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2ForSequenceClassification(config=config) - model.train() - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2ForXVector(config=config) - model.train() - - # freeze everything but the classification head - 
model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Wav2Vec2ForCTC(config) - model.train() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (Wav2Vec2ForCTC, Wav2Vec2Model, Wav2Vec2ForMaskedLM, Wav2Vec2ForSequenceClassification, Wav2Vec2ForPreTraining) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": Wav2Vec2ForSequenceClassification, - "automatic-speech-recognition": Wav2Vec2ForCTC, - "feature-extraction": Wav2Vec2Model, - "fill-mask": Wav2Vec2ForMaskedLM, - } - if is_mindspore_available() - else {} - ) - fx_compatible = True - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = Wav2Vec2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_for_ctc(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_for_ctc(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Model has no inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Model has input_values instead of input_ids") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Model has no tokens embeds") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Model has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - nn.init.constant_(module.weight, 3) - if hasattr(module, "weight_g") and module.weight_g is not None: - nn.init.constant_(module.weight_g, 3) - if hasattr(module, "weight_v") and module.weight_v is not None: - nn.init.constant_(module.weight_v, 3) - if hasattr(module, "bias") and module.bias is not None: - nn.init.constant_(module.bias, 3) - if hasattr(module, "codevectors") and module.codevectors is not None: - nn.init.constant_(module.codevectors, 3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - nn.init.constant_(module.masked_spec_embed, 3) - - def test_mask_feature_prob_ctc(self): - model = Wav2Vec2ForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = Wav2Vec2ForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2 - ) - model.train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 
2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") - self.assertIsNotNone(model) - - -@require_mindspore -class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Wav2Vec2ForCTC, - Wav2Vec2Model, - Wav2Vec2ForMaskedLM, - Wav2Vec2ForSequenceClassification, - Wav2Vec2ForPreTraining, - Wav2Vec2ForAudioFrameClassification, - Wav2Vec2ForXVector, - ) - if is_mindspore_available() - else () - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = Wav2Vec2ModelTester( - self, conv_stride=(3, 3, 3), feat_extract_norm="layer", do_stable_layer_norm=True - ) - self.config_tester = ConfigTester(self, config_class=Wav2Vec2Config, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def test_model_with_attn_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_attn_adapter(*config_and_inputs) - - def test_batched_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_batch_inference(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - @unittest.skip(reason="Model has no input_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Model has input_values instead of input_ids") - def test_forward_signature(self): - pass - - @unittest.skip(reason="Model has no token 
embeddings") - def test_resize_tokens_embeddings(self): - pass - - @unittest.skip(reason="Model has no input_embeds") - def test_model_get_set_embeddings(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - nn.init.constant_(module.weight, 3) - if hasattr(module, "weight_g") and module.weight_g is not None: - nn.init.constant_(module.weight_g, 3) - if hasattr(module, "weight_v") and module.weight_v is not None: - nn.init.constant_(module.weight_v, 3) - if hasattr(module, "bias") and module.bias is not None: - nn.init.constant_(module.bias, 3) - if hasattr(module, "codevectors") and module.codevectors is not None: - nn.init.constant_(module.codevectors, 3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - nn.init.constant_(module.masked_spec_embed, 3) - - def test_model_for_pretraining(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = Wav2Vec2ForPreTraining(config) - - batch_size = inputs_dict["input_values"].shape[0] - feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1])) - - features_shape = (batch_size, feature_seq_length) - - mask_time_indices = _compute_mask_indices( - features_shape, - model.config.mask_time_prob, - model.config.mask_time_length, - min_masks=2, - ) - sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices) - - mask_time_indices = ops.from_numpy(mask_time_indices) - sampled_negative_indices = ops.from_numpy(sampled_negative_indices) - - loss = model( - inputs_dict["input_values"], - attention_mask=inputs_dict["attention_mask"], - mask_time_indices=mask_time_indices, - sampled_negative_indices=sampled_negative_indices, - ).loss - - # more losses - mask_time_indices[:, : mask_time_indices.shape[-1] // 2] = True - - sampled_negative_indices = _sample_negative_indices(features_shape, 10, mask_time_indices.asnumpy()) - sampled_negative_indices = ops.from_numpy(sampled_negative_indices) - loss_more_masked = model( - inputs_dict["input_values"], - attention_mask=inputs_dict["attention_mask"], - mask_time_indices=mask_time_indices, - sampled_negative_indices=sampled_negative_indices, - ).loss - - print(loss, loss_more_masked) - # loss_more_masked has to be bigger or equal loss since more masked inputs have to be predicted - self.assertTrue(loss.item() <= 
loss_more_masked.item()) - - def test_mask_feature_prob_ctc(self): - model = Wav2Vec2ForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = Wav2Vec2ForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", mask_time_prob=0.2, mask_time_length=2 - ) - model.train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_feature_prob_ctc_single_batch(self): - model = Wav2Vec2ForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", - mask_time_prob=0.2, - mask_feature_prob=0.2, - mask_time_length=2, - mask_feature_length=2, - ) - model.train() - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - batch_duration_in_seconds = [6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (1, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - def test_load_and_set_attn_adapter(self): - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - def get_logits(model, input_features): - batch = processor( - input_features, - padding=True, - sampling_rate=processor.feature_extractor.sampling_rate, - return_tensors="ms", - ) - - with no_grad(): - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - return logits - - input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] - - model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="it") - - logits = get_logits(model, input_features) - - model_2 = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter") - model_2.load_adapter("it") - - logits_2 = get_logits(model_2, input_features) - - self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - - # test that loading adapter weights with mismatched vocab sizes can be loaded - def test_load_target_lang_with_mismatched_size(self): - 
processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - def get_logits(model, input_features): - batch = processor( - input_features, - padding=True, - sampling_rate=processor.feature_extractor.sampling_rate, - return_tensors="ms", - ) - - with no_grad(): - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - return logits - - input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] - - model = Wav2Vec2ForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2-adapter", target_lang="fr", ignore_mismatched_sizes=True - ) - - logits = get_logits(model, input_features) - - model_2 = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter") - model_2.load_adapter("fr") - - logits_2 = get_logits(model_2, input_features) - - self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - - def test_load_attn_adapter(self): - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True - ) - - def get_logits(model, input_features): - batch = processor( - input_features, - padding=True, - sampling_rate=processor.feature_extractor.sampling_rate, - return_tensors="ms", - ) - - with no_grad(): - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - return logits - - input_features = [np.random.random(16_000 * s) for s in [1, 3, 2, 6]] - - model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2", adapter_attn_dim=16) - - with tempfile.TemporaryDirectory() as tempdir: - model.save_pretrained(tempdir) - model = Wav2Vec2ForCTC.from_pretrained(tempdir) - - logits = get_logits(model, input_features) - adapter_weights = model._get_adapters() - - # save safe weights - safe_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_SAFE_FILE.format("eng")) - safe_save_file(adapter_weights, safe_filepath, metadata={"format": "ms"}) - - model.load_adapter("eng") - model.load_adapter("eng", use_safetensors=True) - - with self.assertRaises(OSError): - model.load_adapter("eng", use_safetensors=False) - with self.assertRaises(Exception): - model.load_adapter("ita", use_safetensors=True) - logits_2 = get_logits(model, input_features) - - self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - - with tempfile.TemporaryDirectory() as tempdir: - model.save_pretrained(tempdir) - model = Wav2Vec2ForCTC.from_pretrained(tempdir) - - logits = get_logits(model, input_features) - adapter_weights = model._get_adapters() - - # save pt weights - pt_filepath = os.path.join(tempdir, WAV2VEC2_ADAPTER_PT_FILE.format("eng")) - save(adapter_weights, pt_filepath) - - model.load_adapter("eng") - model.load_adapter("eng", use_safetensors=False) - - with self.assertRaises(OSError): - model.load_adapter("eng", use_safetensors=True) - - logits_2 = get_logits(model, input_features) - - self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - - model = Wav2Vec2ForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2-adapter") - logits = get_logits(model, input_features) - - model.load_adapter("eng") - model.load_adapter("eng", use_safetensors=False) - model.load_adapter("eng", use_safetensors=True) - - logits_2 = get_logits(model, input_features) - - self.assertTrue(ops.allclose(logits, logits_2, atol=1e-3)) - - @slow - def test_model_from_pretrained(self): - model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") - 
self.assertIsNotNone(model) - - -@require_mindspore -class Wav2Vec2UtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - num_masks = ops.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = ops.from_numpy(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = ops.from_numpy(mask) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_perplexity(self): - probs = ops.arange(100).reshape(2, 5, 10) / 100 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs) - self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) - - # mask half of the input - mask = ops.ones((2,), dtype=mindspore.bool_) - mask[0] = 0 - - ppl = Wav2Vec2GumbelVectorQuantizer._compute_perplexity(probs, mask) - self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - sequence = ops.div( - ops.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" - ) - features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = 
ops.from_numpy(sampled_negative_indices) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertEqual(ops.unique_consecutive(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - mask[-1, sequence_length // 2 :] = 0 - - sequence = ops.div( - ops.arange(sequence_length * hidden_size), hidden_size, rounding_mode="floor" - ) - features = sequence.view(sequence_length, hidden_size) # each value in vector consits of same value - features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) - - # replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.asnumpy() - ) - sampled_negative_indices = ops.from_numpy(sampled_negative_indices) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertEqual(ops.unique_consecutive(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - - -@require_mindspore -@require_soundfile -@slow -class Wav2Vec2ModelIntegrationTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test", trust_remote_code=True) - - return ds[:num_samples] - - def test_inference_ctc_normal(self): - model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) - input_speech = self._load_datasamples(1) - - input_values = processor(input_speech, 
return_tensors="ms").input_values - - with no_grad(): - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_normal_batched(self): - model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - with no_grad(): - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_robust_batched(self): - model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self") - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - with no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around" - " him with the thousands of spectators were trivialities not worth thinking about", - "his instant panic was followed by a small sharp blow high on his chest", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_integration(self): - model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base") - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - batch_size = inputs_dict["input_values"].shape[0] - feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1])) - - features_shape = (batch_size, feature_seq_length) - - np.random.seed(4) - mask_time_indices = _compute_mask_indices( - features_shape, - model.config.mask_time_prob, - model.config.mask_time_length, - min_masks=2, - ) - mask_time_indices = ops.from_numpy(mask_time_indices) - - with no_grad(): - outputs = model( - inputs_dict.input_values, - mask_time_indices=mask_time_indices, - ) - - # compute cosine similarity - cosine_sim = F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - - # retrieve cosine sim of masked features - cosine_sim_masked = cosine_sim[mask_time_indices] - - # cosine similarity of model is all > 0.5 as model is - # pre-trained on contrastive loss - # fmt: off - expected_cosine_sim_masked = mindspore.tensor([ - 
0.8523, 0.5860, 0.6905, 0.5557, 0.7456, 0.5249, 0.6639, 0.7654, 0.7565, - 0.8167, 0.8222, 0.7960, 0.8034, 0.8166, 0.8310, 0.8263, 0.8274, 0.8258, - 0.8179, 0.8412, 0.8536, 0.5098, 0.4728, 0.6461, 0.4498, 0.6002, 0.5774, - 0.6457, 0.7123, 0.5668, 0.6866, 0.4960, 0.6293, 0.7423, 0.7419, 0.7526, - 0.7768, 0.4898, 0.5393, 0.8183 - ]) - # fmt: on - - self.assertTrue(ops.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3)) - - def test_inference_pretrained(self): - model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - "facebook/wav2vec2-base", return_attention_mask=True - ) - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - batch_size = inputs_dict["input_values"].shape[0] - feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1])) - - features_shape = (batch_size, feature_seq_length) - - mindspore.manual_seed(0) - mask_time_indices = _compute_mask_indices( - features_shape, - model.config.mask_time_prob, - model.config.mask_time_length, - min_masks=2, - ) - mask_time_indices = ops.from_numpy(mask_time_indices) - - with no_grad(): - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) - - # compute cosine similarity - cosine_sim = F.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - - # retrieve cosine sim of masked features - cosine_sim_masked = cosine_sim[mask_time_indices] - - # ... now compare to randomly initialized model - - config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base") - model_rand = Wav2Vec2ForPreTraining(config).eval() - - with no_grad(): - outputs_rand = model_rand( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) - - # compute cosine similarity - cosine_sim_rand = F.cosine_similarity( - outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1 - ) - - # retrieve cosine sim of masked features - cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices] - - # a pretrained wav2vec2 model has learned to predict the quantized latent states - # => the cosine similarity between quantized states and predicted states > 0.5 - # a random wav2vec2 model has not learned to predict the quantized latent states - # => the cosine similarity between quantized states and predicted states is very likely < 0.1 - self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0) - - def test_loss_pretraining(self): - model = Wav2Vec2ForPreTraining.from_pretrained( - "facebook/wav2vec2-base", - attention_dropout=0.0, - feat_proj_dropout=0.0, - hidden_dropout=0.0, - layerdrop=0.0, - ) - model.train() - - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - "facebook/wav2vec2-base", return_attention_mask=True - ) - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - batch_size = inputs_dict["input_values"].shape[0] - feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1])) - - features_shape = (batch_size, feature_seq_length) - - mindspore.manual_seed(0) - np.random.seed(0) - - mask_time_indices = _compute_mask_indices( - features_shape, - model.config.mask_time_prob, - model.config.mask_time_length, - 
min_masks=2, - ) - sampled_negative_indices = _sample_negative_indices( - mask_time_indices.shape, model.config.num_negatives, mask_time_indices - ) - - mask_time_indices = ops.from_numpy(mask_time_indices) - sampled_negative_indices = ops.from_numpy(sampled_negative_indices) - - with no_grad(): - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - sampled_negative_indices=sampled_negative_indices, - ) - - # check diversity loss - num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups - diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors - self.assertTrue(abs(diversity_loss.item() - 0.9538) < 1e-3) - - # check overall loss (contrastive loss + diversity loss) - expected_loss = 116.7094 - - self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3) - - def test_inference_keyword_spotting(self): - model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks") - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks") - input_data = self._load_superb("ks", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) - - expected_labels = [7, 6, 10, 9] - # s3prl logits for the same batch - expected_logits = mindspore.tensor([6.1186, 11.8961, 10.2931, 6.0898]) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_intent_classification(self): - model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ic") - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ic") - input_data = self._load_superb("ic", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - - predicted_logits_action, predicted_ids_action = ops.max(outputs.logits[:, :6], dim=-1) - predicted_logits_object, predicted_ids_object = ops.max(outputs.logits[:, 6:20], dim=-1) - predicted_logits_location, predicted_ids_location = ops.max(outputs.logits[:, 20:24], dim=-1) - - expected_labels_action = [0, 0, 2, 3] - expected_logits_action = mindspore.tensor([0.4568, 11.0848, 1.6621, 9.3841]) - expected_labels_object = [3, 10, 3, 4] - expected_logits_object = mindspore.tensor([1.5322, 10.7094, 5.2469, 22.1318]) - expected_labels_location = [0, 0, 0, 1] - expected_logits_location = mindspore.tensor([1.5335, 6.5096, 10.5704, 11.0569]) - - self.assertListEqual(predicted_ids_action.tolist(), expected_labels_action) - self.assertListEqual(predicted_ids_object.tolist(), expected_labels_object) - self.assertListEqual(predicted_ids_location.tolist(), expected_labels_location) - - self.assertTrue(ops.allclose(predicted_logits_action, expected_logits_action, atol=1e-2)) - self.assertTrue(ops.allclose(predicted_logits_object, expected_logits_object, atol=1e-2)) - self.assertTrue(ops.allclose(predicted_logits_location, expected_logits_location, atol=1e-2)) - - def test_inference_speaker_identification(self): - model = 
Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-sid") - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-sid") - input_data = self._load_superb("si", 4) - - output_logits = [] - with no_grad(): - for example in input_data["speech"]: - input = processor(example, return_tensors="ms", padding=True) - output = model(input.input_values, attention_mask=None) - output_logits.append(output.logits[0]) - output_logits = ops.stack(output_logits) - predicted_logits, predicted_ids = ops.max(output_logits, dim=-1) - - expected_labels = [251, 1, 1, 3] - # s3prl logits for the same batch - expected_logits = mindspore.tensor([37.5627, 71.6362, 64.2419, 31.7778]) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_inference_emotion_recognition(self): - model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er") - processor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er") - input_data = self._load_superb("er", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - predicted_logits, predicted_ids = ops.max(outputs.logits, dim=-1) - - expected_labels = [1, 1, 2, 2] - # s3prl logits for the same batch - expected_logits = mindspore.tensor([2.1722, 3.0779, 8.0287, 6.6797]) - - self.assertListEqual(predicted_ids.tolist(), expected_labels) - self.assertTrue(ops.allclose(predicted_logits, expected_logits, atol=1e-2)) - - def test_phoneme_recognition(self): - model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") - processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") - - input_speech = self._load_datasamples(4) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - with no_grad(): - logits = model(input_values, attention_mask=attention_mask).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t", - "s w ɛ t k ʌ v ɚ d b ɹ iː ɔ n z b ɑː d i t ɹ ɪ k l ɪ ŋ ɪ n t ə ð ə t aɪ t l oɪ n k l ɑː θ ð æ w ʌ z ð ɪ oʊ" - " n l i ɡ ɑːɹ m ə n t h iː w ɔːɹ", - "ð ə k aɪ t ɔ n h ɪ z tʃ ɛ s t s t ɪ l d ɹ ɪ p ɪ ŋ b l ʌ d ð ɪ eɪ k ʌ v h ɪ z oʊ v ɚ s t ɹ eɪ n d aɪ z iː" - " v ə n ð ə s ɔːɹ ɹ ɪ ŋ ɐ ɹ iː n ɐ ɚ ɹ aʊ n d h ɪ m w ɪ ð ə θ aʊ z ə n d z ʌ v s p ɛ k t eɪ ɾ ɚ z w ɜː t ɹ" - " ɪ v ɪ æ l ᵻ ɾ i z n ɑː t w ɜː θ θ ɪ ŋ k ɪ ŋ ɐ b aʊ t", - "h ɪ z ɪ n s t ə n t v p æ n ɪ k w ʌ z f ɑː l oʊ d b aɪ ɐ s m ɔː l ʃ ɑːɹ p b l oʊ h aɪ ɔ n h ɪ z tʃ ɛ s t", - ] - # should correspond to =>: - # [ - # "a man said to the universe sir i exist", - # "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - # "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about", - # "his instant panic was followed by a small sharp blow high on his chest", - # ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - @require_pyctcdecode - def test_wav2vec2_with_lm(self): - ds 
= load_dataset( - "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True - ) - sample = next(iter(ds)) - - resample = Resample(48_000, 16_000) - resampled_audio = resample(sample["audio"]["array"]) - - model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") - processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") - - input_values = processor(resampled_audio, return_tensors="ms").input_values - - with no_grad(): - logits = model(input_values).logits - - transcription = processor.batch_decode(logits.asnumpy()).text - - self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - - @require_pyctcdecode - def test_wav2vec2_with_lm_pool(self): - ds = load_dataset( - "mozilla-foundation/common_voice_11_0", "es", split="test", streaming=True, trust_remote_code=True - ) - sample = next(iter(ds)) - - resample = Resample(48_000, 16_000) - resampled_audio = resample(sample["audio"]["array"]) - - model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") - processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm") - - input_values = processor(resampled_audio, return_tensors="ms").input_values - - with no_grad(): - logits = model(input_values).logits - - # test user-managed pool - with multiprocessing.get_context("fork").Pool(2) as pool: - transcription = processor.batch_decode(logits.asnumpy(), pool).text - - self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - - # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: - transcription = processor.batch_decode(logits.asnumpy(), pool, num_processes=2).text - - self.assertIn("num_process", cl.out) - self.assertIn("it will be ignored", cl.out) - - self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") - - @require_pyctcdecode - def test_wav2vec2_with_lm_invalid_pool(self): - run_test_in_subprocess(test_case=self, target_func=_test_wav2vec2_with_lm_invalid_pool, inputs=None) - - def test_inference_diarization(self): - model = Wav2Vec2ForAudioFrameClassification.from_pretrained("anton-l/wav2vec2-base-superb-sd") - processor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sd") - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True, sampling_rate=16_000) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = mindspore.tensor( - [ - [[-5.2807, -5.1272], [-5.4059, -4.7757], [-5.2764, -4.9621], [-5.0117, -4.5851]], - [[-1.7643, -0.5462], [-1.7369, -0.2649], [-1.5066, -0.6200], [-4.5703, -2.4863]], - [[-0.8656, -0.4783], [-0.8899, -0.3289], [-0.9267, -0.5781], [-0.7817, -0.4619]], - [[-4.8625, -2.5316], [-5.2339, -2.2155], [-4.9835, -2.0344], [-4.4727, -1.8421]], - ], - ) - self.assertEqual(labels[0, :, 0].sum(), 555) - self.assertEqual(labels[0, :, 1].sum(), 299) - self.assertTrue(ops.allclose(outputs.logits[:, :4], expected_logits, atol=1e-2)) - - def 
test_inference_speaker_verification(self): - model = Wav2Vec2ForXVector.from_pretrained("anton-l/wav2vec2-base-superb-sv") - processor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sv") - input_data = self._load_superb("si", 4) - - inputs = processor(input_data["speech"], return_tensors="ms", padding=True, sampling_rate=16_000) - labels = mindspore.tensor([5, 1, 1, 3]).T - - with no_grad(): - input_values = inputs.input_values - attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = nn.functional.normalize(outputs.embeddings, dim=-1) - - cosine_sim = nn.CosineSimilarity(dim=-1) - # id10002 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[1], embeddings[2]).numpy(), 0.9758, 3) - # id10006 vs id10002 - self.assertAlmostEqual(cosine_sim(embeddings[0], embeddings[1]).numpy(), 0.7579, 3) - # id10002 vs id10004 - self.assertAlmostEqual(cosine_sim(embeddings[2], embeddings[3]).numpy(), 0.7594, 3) - - self.assertAlmostEqual(outputs.loss.item(), 17.7963, 2) - - def test_inference_mms_1b_all(self): - model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all") - processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all") - - LANG_MAP = {"it": "ita", "es": "spa", "fr": "fra", "en": "eng"} - - def run_model(lang): - ds = load_dataset( - "mozilla-foundation/common_voice_11_0", lang, split="test", streaming=True, trust_remote_code=True - ) - sample = next(iter(ds)) - - wav2vec2_lang = LANG_MAP[lang] - - model.load_adapter(wav2vec2_lang) - processor.tokenizer.set_target_lang(wav2vec2_lang) - - resample = Resample(48_000, 16_000) - resampled_audio = resample(sample["audio"]["array"]) - - inputs = processor(resampled_audio, sampling_rate=16_000, return_tensors="ms") - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - with no_grad(): - outputs = model(input_values, attention_mask=attention_mask).logits - - ids = ops.argmax(outputs, dim=-1)[0] - - transcription = processor.decode(ids) - return transcription - - TRANSCRIPTIONS = { - "it": "il libro ha suscitato molte polemiche a causa dei suoi contenuti", - "es": "habitan aguas poco profundas y rocosas", - "fr": "ce dernier est volé tout au long de l'histoire romaine", - "en": "joe keton disapproved of films and buster also had reservations about the media", - } - - for lang in LANG_MAP.keys(): - assert run_model(lang) == TRANSCRIPTIONS[lang] diff --git a/tests/transformers/models/wav2vec2/test_processor_wav2vec2.py b/tests/transformers/models/wav2vec2/test_processor_wav2vec2.py deleted file mode 100644 index c3c28f218..000000000 --- a/tests/transformers/models/wav2vec2/test_processor_wav2vec2.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=missing-class-docstring -# pylint: disable=missing-function-docstring -""" Testing suite for the Mindspore Wav2Vec2 processor. 
""" - -import json -import os -import shutil -import tempfile -import unittest - -from mindnlp.transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor -from mindnlp.transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES -from mindnlp.configs import FEATURE_EXTRACTOR_NAME - -from .test_feature_extraction_wav2vec2 import floats_list - - -class Wav2Vec2ProcessorTest(unittest.TestCase): - def setUp(self): - vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - self.add_kwargs_tokens_map = { - "pad_token": "", - "unk_token": "", - "bos_token": "", - "eos_token": "", - } - feature_extractor_map = { - "feature_size": 1, - "padding_value": 0.0, - "sampling_rate": 16000, - "return_attention_mask": False, - "do_normalize": True, - } - - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - - with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(feature_extractor_map) + "\n") - - def get_tokenizer(self, **kwargs_init): - kwargs = self.add_kwargs_tokens_map.copy() - kwargs.update(kwargs_init) - return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_feature_extractor(self, **kwargs): - return Wav2Vec2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - - processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - processor.save_pretrained(self.tmpdirname) - processor = Wav2Vec2Processor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor) - - def test_save_load_pretrained_additional_features(self): - processor = Wav2Vec2Processor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) - - processor = Wav2Vec2Processor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor) - - def test_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = 
feature_extractor(raw_speech, return_tensors="np") - input_processor = processor(raw_speech, return_tensors="np") - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - self.assertListEqual( - processor.model_input_names, - feature_extractor.model_input_names, - msg="`processor` and `feature_extractor` model input names do not match", - ) diff --git a/tests/transformers/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/transformers/models/wav2vec2/test_tokenization_wav2vec2.py deleted file mode 100644 index 7bb41d680..000000000 --- a/tests/transformers/models/wav2vec2/test_tokenization_wav2vec2.py +++ /dev/null @@ -1,827 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tests for the Wav2Vec2 tokenizer.""" - -import inspect -import json -import os -import random -import shutil -import tempfile -import unittest - -import numpy as np - -from tokenizers import AddedToken -from mindnlp.transformers import ( - Wav2Vec2Config, - Wav2Vec2CTCTokenizer, - Wav2Vec2Tokenizer, -) -from mindnlp.transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizerOutput -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...test_tokenization_common import TokenizerTesterMixin - - -global_rng = random.Random() - - -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - -class Wav2Vec2TokenizerTest(unittest.TestCase): - tokenizer_class = Wav2Vec2Tokenizer - - def setUp(self): - super().setUp() - - vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} - - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Wav2Vec2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def test_tokenizer_decode(self): - # TODO(PVP) - change to facebook - tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") - - sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], - ] - tokens = tokenizer.decode(sample_ids[0]) - batch_tokens = tokenizer.batch_decode(sample_ids) - self.assertEqual(tokens, batch_tokens[0]) - self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) - - def test_tokenizer_decode_special(self): - # TODO(PVP) - change to facebook - tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") - - sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], - ] - sample_ids_2 = [ - [11, 5, 5, 5, 5, 5, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98], - [ - 24, - 22, - 5, - tokenizer.pad_token_id, - tokenizer.pad_token_id, - tokenizer.pad_token_id, - tokenizer.word_delimiter_token_id, - 24, - 22, - 5, - 77, - tokenizer.word_delimiter_token_id, - ], - ] - - batch_tokens = tokenizer.batch_decode(sample_ids) - batch_tokens_2 = tokenizer.batch_decode(sample_ids_2) - self.assertEqual(batch_tokens, batch_tokens_2) - self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) - - def test_tokenizer_decode_added_tokens(self): - tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") - tokenizer.add_tokens(["!", "?"]) - tokenizer.add_special_tokens({"cls_token": "$$$"}) - - sample_ids = [ - [ - 11, - 5, - 15, - tokenizer.pad_token_id, - 15, - 8, - 98, - 32, - 32, - 33, - tokenizer.word_delimiter_token_id, - 32, - 32, - 33, - 34, - 34, - ], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34], - ] - batch_tokens = tokenizer.batch_decode(sample_ids) - 
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True) - - self.assertEqual(batch_tokens, ["HELLO!? !?$$$", "BYE BYE$$$"]) - self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"]) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - tokenizer = self.get_tokenizer() - # create three inputs of length 800, 1000, and 1200 - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] - - # Test not batched input - encoded_sequences_1 = tokenizer(speech_inputs[0], return_tensors="np").input_values - encoded_sequences_2 = tokenizer(np_speech_inputs[0], return_tensors="np").input_values - self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) - - # Test batched - encoded_sequences_1 = tokenizer(speech_inputs, return_tensors="np").input_values - encoded_sequences_2 = tokenizer(np_speech_inputs, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - # Test 2-D numpy arrays are batched. - speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)] - np_speech_inputs = np.asarray(speech_inputs) - encoded_sequences_1 = tokenizer(speech_inputs, return_tensors="np").input_values - encoded_sequences_2 = tokenizer(np_speech_inputs, return_tensors="np").input_values - for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): - self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) - - def test_padding(self, max_length=50): - def _input_values_have_equal_length(input_values): - length = len(input_values[0]) - for input_values_slice in input_values[1:]: - if len(input_values_slice) != length: - return False - return True - - def _input_values_are_equal(input_values_1, input_values_2): - if len(input_values_1) != len(input_values_2): - return False - - for input_values_slice_1, input_values_slice_2 in zip(input_values_1, input_values_2): - if not np.allclose(np.asarray(input_values_slice_1), np.asarray(input_values_slice_2), atol=1e-3): - return False - return True - - tokenizer = self.get_tokenizer() - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - input_values_1 = tokenizer(speech_inputs).input_values - input_values_2 = tokenizer(speech_inputs, padding="longest").input_values - input_values_3 = tokenizer(speech_inputs, padding="longest", max_length=1600).input_values - - self.assertFalse(_input_values_have_equal_length(input_values_1)) - self.assertTrue(_input_values_have_equal_length(input_values_2)) - self.assertTrue(_input_values_have_equal_length(input_values_3)) - self.assertTrue(_input_values_are_equal(input_values_2, input_values_3)) - self.assertTrue(len(input_values_1[0]) == 800) - self.assertTrue(len(input_values_2[0]) == 1200) - # padding should be 0.0 - self.assertTrue(abs(sum(np.asarray(input_values_2[0])[800:])) < 1e-3) - self.assertTrue(abs(sum(np.asarray(input_values_2[1])[1000:])) < 1e-3) - - input_values_4 = tokenizer(speech_inputs, padding="max_length").input_values - input_values_5 = tokenizer(speech_inputs, padding="max_length", max_length=1600).input_values - - self.assertTrue(_input_values_are_equal(input_values_1, input_values_4)) - self.assertEqual(input_values_5.shape, (3, 1600)) - # padding should be 0.0 - self.assertTrue(abs(sum(np.asarray(input_values_5[0])[800:1200])) < 1e-3) - - input_values_6 = 
tokenizer(speech_inputs, pad_to_multiple_of=500).input_values - input_values_7 = tokenizer(speech_inputs, padding="longest", pad_to_multiple_of=500).input_values - input_values_8 = tokenizer( - speech_inputs, padding="max_length", pad_to_multiple_of=500, max_length=2400 - ).input_values - - self.assertTrue(_input_values_are_equal(input_values_1, input_values_6)) - self.assertEqual(input_values_7.shape, (3, 1500)) - self.assertEqual(input_values_8.shape, (3, 2500)) - # padding should be 0.0 - self.assertTrue(abs(sum(np.asarray(input_values_7[0])[800:])) < 1e-3) - self.assertTrue(abs(sum(np.asarray(input_values_7[1])[1000:])) < 1e-3) - self.assertTrue(abs(sum(np.asarray(input_values_7[2])[1200:])) < 1e-3) - self.assertTrue(abs(sum(np.asarray(input_values_8[0])[800:])) < 1e-3) - self.assertTrue(abs(sum(np.asarray(input_values_8[1])[1000:])) < 1e-3) - self.assertTrue(abs(sum(np.asarray(input_values_8[2])[1200:])) < 1e-3) - - def test_save_pretrained(self): - pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0] - tokenizer = self.tokenizer_class.from_pretrained(pretrained_name) - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_files = tokenizer.save_pretrained(tmpdirname2) - self.assertSequenceEqual( - sorted(tuple(VOCAB_FILES_NAMES.values()) + ("special_tokens_map.json", "added_tokens.json")), - sorted(x.split(os.path.sep)[-1] for x in tokenizer_files), - ) - - # Checks everything loads correctly in the same way - tokenizer_p = self.tokenizer_class.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer.special_tokens_map: - self.assertTrue(key in tokenizer_p.special_tokens_map) - - shutil.rmtree(tmpdirname2) - - def test_get_vocab(self): - tokenizer = self.get_tokenizer() - vocab_dict = tokenizer.get_vocab() - self.assertIsInstance(vocab_dict, dict) - self.assertGreaterEqual(len(tokenizer), len(vocab_dict)) - - vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] - self.assertEqual(len(vocab), len(tokenizer)) - - tokenizer.add_tokens(["asdfasdfasdfasdf"]) - vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] - self.assertEqual(len(vocab), len(tokenizer)) - - def test_save_and_load_tokenizer(self): - tokenizer = self.get_tokenizer() - # Isolate this from the other tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - sample_ids = [0, 1, 4, 8, 9, 0, 12] - before_tokens = tokenizer.decode(sample_ids) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.decode(sample_ids) - after_vocab = after_tokenizer.get_vocab() - - self.assertEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - - shutil.rmtree(tmpdirname) - - tokenizer = self.get_tokenizer() - - # Isolate this from the other tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - before_len = len(tokenizer) - sample_ids = [0, 1, 4, 8, 9, 0, 12, before_len, before_len + 1, before_len + 2] - tokenizer.add_tokens(["?", "!"]) - additional_special_tokens = tokenizer.additional_special_tokens - additional_special_tokens.append("&") - tokenizer.add_special_tokens( - {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False - ) - before_tokens = tokenizer.decode(sample_ids) - before_vocab = tokenizer.get_vocab() - 
tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.decode(sample_ids) - after_vocab = after_tokenizer.get_vocab() - - self.assertEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - - self.assertTrue(len(tokenizer), before_len + 3) - self.assertTrue(len(tokenizer), len(after_tokenizer)) - shutil.rmtree(tmpdirname) - - def test_tokenizer_slow_store_full_signature(self): - signature = inspect.signature(self.tokenizer_class.__init__) - tokenizer = self.get_tokenizer() - - for parameter_name, parameter in signature.parameters.items(): - if parameter.default != inspect.Parameter.empty: - self.assertIn(parameter_name, tokenizer.init_kwargs) - - def test_zero_mean_unit_variance_normalization(self): - tokenizer = self.get_tokenizer(do_normalize=True) - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - processed = tokenizer(speech_inputs, padding="longest") - input_values = processed.input_values - - def _check_zero_mean_unit_variance(input_vector): - self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3) - self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3) - - _check_zero_mean_unit_variance(input_values[0, :800]) - _check_zero_mean_unit_variance(input_values[1, :1000]) - _check_zero_mean_unit_variance(input_values[2]) - - def test_return_attention_mask(self): - speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] - - # default case -> no attention_mask is returned - tokenizer = self.get_tokenizer() - processed = tokenizer(speech_inputs) - self.assertNotIn("attention_mask", processed) - - # wav2vec2-lv60 -> return attention_mask - tokenizer = self.get_tokenizer(return_attention_mask=True) - processed = tokenizer(speech_inputs, padding="longest") - - self.assertIn("attention_mask", processed) - self.assertListEqual(list(processed.attention_mask.shape), list(processed.input_values.shape)) - self.assertListEqual(processed.attention_mask.sum(-1).tolist(), [800, 1000, 1200]) - - @slow - @require_mindspore - def test_pretrained_checkpoints_are_set_correctly(self): - # this test makes sure that models that are using - # group norm don't have their tokenizer return the - # attention_mask - model_id = "facebook/wav2vec2-base-960h" - config = Wav2Vec2Config.from_pretrained(model_id) - tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_id) - - # only "layer" feature extraction norm should make use of - # attention_mask - self.assertEqual(tokenizer.return_attention_mask, config.feat_extract_norm == "layer") - - -class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "facebook/wav2vec2-base-960h" - tokenizer_class = Wav2Vec2CTCTokenizer - test_rust_tokenizer = False - - def setUp(self): - super().setUp() - - vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} - - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def test_tokenizer_add_token_chars(self): - tokenizer = 
self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - - # check adding a single token - tokenizer.add_tokens("x") - token_ids = tokenizer("C x A").input_ids - self.assertEqual(token_ids, [19, 4, 32, 4, 7]) - - tokenizer.add_tokens(["a", "b", "c"]) - token_ids = tokenizer("C a A c").input_ids - self.assertEqual(token_ids, [19, 4, 33, 4, 7, 4, 35]) - - tokenizer.add_tokens(["a", "b", "c"]) - token_ids = tokenizer("CaA c").input_ids - self.assertEqual(token_ids, [19, 33, 7, 4, 35]) - - def test_tokenizer_add_token_words(self): - tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - - # check adding a single token - tokenizer.add_tokens("xxx") - token_ids = tokenizer("C xxx A B").input_ids - self.assertEqual(token_ids, [19, 4, 32, 4, 7, 4, 24]) - - tokenizer.add_tokens(["aaa", "bbb", "ccc"]) - token_ids = tokenizer("C aaa A ccc B B").input_ids - self.assertEqual(token_ids, [19, 4, 33, 4, 7, 4, 35, 4, 24, 4, 24]) - - tokenizer.add_tokens(["aaa", "bbb", "ccc"]) - token_ids = tokenizer("CaaaA ccc B B").input_ids - self.assertEqual(token_ids, [19, 33, 7, 4, 35, 4, 24, 4, 24]) - - def test_tokenizer_decode(self): - tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - - sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], - ] - tokens = tokenizer.decode(sample_ids[0]) - batch_tokens = tokenizer.batch_decode(sample_ids) - self.assertEqual(tokens, batch_tokens[0]) - self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) - - def test_tokenizer_decode_special(self): - tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - - # fmt: off - sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], - ] - sample_ids_2 = [ - [11, 5, 5, 5, 5, 5, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98], - [24, 22, 5, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.word_delimiter_token_id], - ] - # fmt: on - - batch_tokens = tokenizer.batch_decode(sample_ids) - batch_tokens_2 = tokenizer.batch_decode(sample_ids_2) - self.assertEqual(batch_tokens, batch_tokens_2) - self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) - - def test_tokenizer_decode_added_tokens(self): - tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - tokenizer.add_tokens(["!", "?", ""]) - tokenizer.add_special_tokens({"cls_token": "$$$"}) - - # fmt: off - sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98, 32, 32, 33, tokenizer.word_delimiter_token_id, 32, 32, 33, 34, 34, 35, 35], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34, 35, 35], - ] - # fmt: on - batch_tokens = tokenizer.batch_decode(sample_ids) - batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True) - - self.assertEqual(batch_tokens, ["HELLO!? !?$$$", "BYE BYE$$$"]) - self.assertEqual(batch_tokens_2, ["HELO!? 
!?", "BYE BYE"]) - - def test_special_characters_in_vocab(self): - sent = "ʈʰ æ æ̃ ˧ kʰ" - - vocab_dict = {k: v for v, k in enumerate(set(sent.split()))} - vocab_file = os.path.join(self.tmpdirname, "vocab_special.json") - - with open(vocab_file, "w") as f: - json.dump(vocab_dict, f) - - tokenizer = Wav2Vec2CTCTokenizer(vocab_file) # , unk_token="") - - expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True) - self.assertEqual(sent, expected_sent) - - tokenizer.save_pretrained(os.path.join(self.tmpdirname, "special_tokenizer")) - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(os.path.join(self.tmpdirname, "special_tokenizer")) - - expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True) - self.assertEqual(sent, expected_sent) - - @staticmethod - def get_from_offsets(offsets, key): - retrieved_list = [d[key] for d in offsets] - return retrieved_list - - def test_offsets(self): - tokenizer = self.get_tokenizer() - - # fmt: off - # HEEEEE||LLLLO => HE LLO - # 1H + 5E + 2| + 3L + 1 + 1L + 1O + 1 - sample_ids = [11, 5, 5, 5, 5, 5, 4, 4, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98] - # fmt: on - - outputs_char = tokenizer.decode(sample_ids, output_char_offsets=True) - # check Wav2Vec2CTCTokenizerOutput keys for char - self.assertEqual(len(outputs_char.keys()), 2) - self.assertTrue("text" in outputs_char) - self.assertTrue("char_offsets" in outputs_char) - self.assertTrue(isinstance(outputs_char, Wav2Vec2CTCTokenizerOutput)) - - outputs_word = tokenizer.decode(sample_ids, output_word_offsets=True) - # check Wav2Vec2CTCTokenizerOutput keys for word - self.assertEqual(len(outputs_word.keys()), 2) - self.assertTrue("text" in outputs_word) - self.assertTrue("word_offsets" in outputs_word) - self.assertTrue(isinstance(outputs_word, Wav2Vec2CTCTokenizerOutput)) - - outputs = tokenizer.decode(sample_ids, output_char_offsets=True, output_word_offsets=True) - # check Wav2Vec2CTCTokenizerOutput keys for both - self.assertEqual(len(outputs.keys()), 3) - self.assertTrue("text" in outputs) - self.assertTrue("char_offsets" in outputs) - self.assertTrue("word_offsets" in outputs) - self.assertTrue(isinstance(outputs, Wav2Vec2CTCTokenizerOutput)) - - # check that order of chars is correct and identical for both outputs - self.assertEqual("".join(self.get_from_offsets(outputs["char_offsets"], "char")), outputs.text) - self.assertEqual( - self.get_from_offsets(outputs["char_offsets"], "char"), ["H", "E", " ", "L", "L", "O", ""] - ) - self.assertListEqual( - self.get_from_offsets(outputs["char_offsets"], "char"), - self.get_from_offsets(outputs_char["char_offsets"], "char"), - ) - - # check that order of words is correct and identical to both outputs - self.assertEqual(" ".join(self.get_from_offsets(outputs["word_offsets"], "word")), outputs.text) - self.assertListEqual(self.get_from_offsets(outputs["word_offsets"], "word"), ["HE", "LLO"]) - self.assertListEqual( - self.get_from_offsets(outputs["word_offsets"], "word"), - self.get_from_offsets(outputs_word["word_offsets"], "word"), - ) - - # check that offsets are actually correct for char - # 0 is H, 1 is E, 6 is | (" "), 8 is 1st L, 12 is 2nd L, 13 is O, 14 is - self.assertListEqual(self.get_from_offsets(outputs["char_offsets"], "start_offset"), [0, 1, 6, 8, 12, 13, 14]) - # 1 is H, 6 is E, 8 is | (" "), 11 is 1st L (note due to - # different begin of 2nd L), 13 is 2nd L, 14 is O, 15 is - self.assertListEqual(self.get_from_offsets(outputs["char_offsets"], "end_offset"), [1, 6, 8, 
11, 13, 14, 15]) - - # check that offsets are actually correct for word - # H is at 1st position of first word, first L is at 8th position of second word - self.assertListEqual(self.get_from_offsets(outputs["word_offsets"], "start_offset"), [0, 8]) - # last E is at 6th position of first word, first L is at last (15th) position of second word - self.assertListEqual(self.get_from_offsets(outputs["word_offsets"], "end_offset"), [6, 15]) - - def test_word_offsets_from_char_offsets(self): - tokenizer = self.get_tokenizer() - - char_offsets = [ - {"char": "H", "start_offset": 0, "end_offset": 1}, - {"char": "I", "start_offset": 1, "end_offset": 2}, - {"char": " ", "start_offset": 2, "end_offset": 3}, - {"char": "L", "start_offset": 3, "end_offset": 4}, - {"char": "I", "start_offset": 4, "end_offset": 5}, - ] - word_offsets = tokenizer._get_word_offsets(char_offsets, tokenizer.replace_word_delimiter_char) - - self.assertEqual( - word_offsets, - [{"word": "HI", "start_offset": 0, "end_offset": 2}, {"word": "LI", "start_offset": 3, "end_offset": 5}], - ) - - # Double spaces don't get counted - char_offsets = [ - {"char": " ", "start_offset": 0, "end_offset": 1}, - {"char": "H", "start_offset": 1, "end_offset": 2}, - {"char": "I", "start_offset": 2, "end_offset": 3}, - {"char": " ", "start_offset": 3, "end_offset": 4}, - {"char": " ", "start_offset": 4, "end_offset": 5}, - {"char": "L", "start_offset": 5, "end_offset": 6}, - {"char": "I", "start_offset": 6, "end_offset": 7}, - {"char": "I", "start_offset": 7, "end_offset": 8}, - {"char": " ", "start_offset": 8, "end_offset": 9}, - {"char": " ", "start_offset": 9, "end_offset": 10}, - ] - word_offsets = tokenizer._get_word_offsets(char_offsets, tokenizer.replace_word_delimiter_char) - self.assertEqual( - word_offsets, - [{"word": "HI", "start_offset": 1, "end_offset": 3}, {"word": "LII", "start_offset": 5, "end_offset": 8}], - ) - - def test_offsets_batch(self): - tokenizer = self.get_tokenizer() - - def check_list_tuples_equal(outputs_batch, outputs_list): - self.assertTrue(isinstance(outputs_batch, Wav2Vec2CTCTokenizerOutput)) - self.assertTrue(isinstance(outputs_list[0], Wav2Vec2CTCTokenizerOutput)) - - # transform list to ModelOutput - outputs_batch_2 = Wav2Vec2CTCTokenizerOutput({k: [d[k] for d in outputs_list] for k in outputs_list[0]}) - - self.assertListEqual(outputs_batch["text"], outputs_batch_2["text"]) - - def recursive_check(list_or_dict_1, list_or_dict_2): - if isinstance(list_or_dict_1, list): - [recursive_check(l1, l2) for l1, l2 in zip(list_or_dict_1, list_or_dict_2)] - self.assertEqual(list_or_dict_1, list_or_dict_2) - - if "char_offsets" in outputs_batch: - recursive_check(outputs_batch["char_offsets"], outputs_batch_2["char_offsets"]) - - if "word_offsets" in outputs_batch: - recursive_check(outputs_batch["word_offsets"], outputs_batch_2["word_offsets"]) - - # fmt: off - sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 4, 8, 98, 32, 32, 32, 32, 4, 33, tokenizer.word_delimiter_token_id, 32, 32, 33, 34, 34], - [24, 22, 5, tokenizer.word_delimiter_token_id, tokenizer.word_delimiter_token_id, 24, 22, 22, 22, 4, 5, 77, tokenizer.pad_token_id, 22, 22, 4, 34, 34, 34, 34], - ] - # fmt: on - - # We assume that `decode` works as expected. 
All we will check now is - # the output type is correct and the output is identical to `decode` - - # char - outputs_char_batch = tokenizer.batch_decode(sample_ids, output_char_offsets=True) - outputs_char = [tokenizer.decode(ids, output_char_offsets=True) for ids in sample_ids] - check_list_tuples_equal(outputs_char_batch, outputs_char) - - # word - outputs_word_batch = tokenizer.batch_decode(sample_ids, output_word_offsets=True) - outputs_word = [tokenizer.decode(ids, output_word_offsets=True) for ids in sample_ids] - check_list_tuples_equal(outputs_word_batch, outputs_word) - - # both - outputs_batch = tokenizer.batch_decode(sample_ids, output_char_offsets=True, output_word_offsets=True) - outputs = [tokenizer.decode(ids, output_word_offsets=True, output_char_offsets=True) for ids in sample_ids] - check_list_tuples_equal(outputs_batch, outputs) - - def test_offsets_integration(self): - tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - # pred_ids correspond to the following code - # ``` - # from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC - # from datasets import load_dataset - # import datasets - # import torch - # model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") - # feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") - # - # ds = load_dataset("common_voice", "en", split="train", streaming=True) - # ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) - # ds_iter = iter(ds) - # sample = next(ds_iter) - # - # input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values - # logits = model(input_values).logits - # pred_ids = torch.argmax(logits, axis=-1).cpu().tolist() - # ``` - # fmt: off - pred_ids = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 11, 0, 0, 0, 22, 0, 0, 4, 4, 4, 14, 0, 0, 0, 0, 0, 8, 8, 0, 5, 5, 0, 12, 0, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 10, 0, 0, 0, 15, 0, 0, 10, 0, 0, 0, 12, 0, 0, 0, 0, 0, 7, 0, 9, 0, 0, 14, 0, 0, 0, 13, 0, 7, 0, 0, 4, 4, 0, 15, 8, 8, 0, 0, 8, 0, 26, 0, 0, 4, 4, 0, 0, 15, 0, 0, 0, 0, 0, 0, 10, 0, 26, 5, 5, 0, 4, 4, 0, 0, 12, 11, 0, 0, 5, 4, 4, 4, 0, 18, 0, 0, 0, 7, 9, 9, 0, 6, 0, 12, 12, 4, 4, 0, 6, 0, 0, 8, 0, 4, 4, 4, 0, 19, 0, 0, 8, 9, 9, 0, 0, 0, 0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 16, 16, 0, 0, 17, 5, 5, 5, 0, 4, 4, 4, 0, 0, 29, 29, 0, 0, 0, 0, 8, 11, 0, 9, 9, 0, 0, 0, 4, 4, 0, 12, 12, 0, 0, 0, 9, 0, 0, 0, 0, 0, 8, 18, 0, 0, 0, 4, 4, 0, 0, 8, 9, 0, 4, 4, 0, 6, 11, 5, 0, 4, 4, 0, 13, 13, 0, 0, 0, 10, 0, 0, 25, 0, 0, 6, 0, 4, 4, 0, 0, 0, 0, 7, 0, 0, 23, 0, 0, 4, 4, 0, 0, 0, 6, 11, 0, 5, 4, 4, 18, 0, 0, 0, 0, 0, 0, 7, 15, 0, 0, 0, 15, 15, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] - - # wav2vec2-base downsamples input audio by a factor of 320 - # sampling rate for wav2vec2-base is 16_000 - time_offset_wav2vec2_base = 320 / 16_000 - - expected_char_time_stamps_text = ['W', 'H', 'Y', ' ', 'D', 'O', 'E', 'S', ' ', 'M', 'I', 'L', 'I', 'S', 'A', 'N', 'D', 'R', 'A', ' ', 'L', 'O', 'O', 'K', ' ', 'L', 'I', 'K', 'E', ' ', 'S', 'H', 'E', ' ', 'W', 'A', 'N', 'T', 'S', ' ', 'T', 'O', ' ', 'C', 'O', 'N', 'S', 'U', 'M', 'E', ' ', 'J', 'O', 'H', 'N', ' ', 'S', 'N', 'O', 'W', ' ', 'O', 'N', ' ', 'T', 'H', 'E', ' ', 'R', 'I', 'V', 'T', ' ', 'A', 
'P', ' ', 'T', 'H', 'E', ' ', 'W', 'A', 'L', 'L', ' '] - expected_char_time_stamps_start = [1.42, 1.44, 1.52, 1.58, 1.64, 1.76, 1.82, 1.88, 1.92, 2.26, 2.32, 2.4, 2.46, 2.54, 2.66, 2.7, 2.76, 2.84, 2.88, 2.94, 3.0, 3.02, 3.1, 3.14, 3.2, 3.28, 3.42, 3.46, 3.48, 3.54, 3.62, 3.64, 3.7, 3.72, 3.8, 3.88, 3.9, 3.96, 4.0, 4.04, 4.1, 4.16, 4.2, 4.28, 4.34, 4.36, 4.48, 4.66, 4.74, 4.76, 4.84, 4.94, 5.06, 5.08, 5.12, 5.22, 5.28, 5.38, 5.5, 5.52, 5.6, 5.68, 5.7, 5.74, 5.8, 5.82, 5.84, 5.88, 5.94, 6.04, 6.1, 6.16, 6.2, 6.32, 6.38, 6.44, 6.54, 6.56, 6.6, 6.62, 6.66, 6.8, 6.82, 6.9, 6.96] - expected_char_time_stamps_end = [1.44, 1.46, 1.54, 1.64, 1.66, 1.8, 1.86, 1.9, 2.06, 2.28, 2.34, 2.42, 2.48, 2.56, 2.68, 2.72, 2.78, 2.86, 2.9, 2.98, 3.02, 3.06, 3.12, 3.16, 3.24, 3.3, 3.44, 3.48, 3.52, 3.58, 3.64, 3.66, 3.72, 3.78, 3.82, 3.9, 3.94, 3.98, 4.04, 4.08, 4.12, 4.18, 4.26, 4.3, 4.36, 4.4, 4.52, 4.7, 4.76, 4.82, 4.9, 4.98, 5.08, 5.1, 5.16, 5.26, 5.32, 5.4, 5.52, 5.54, 5.64, 5.7, 5.72, 5.78, 5.82, 5.84, 5.86, 5.92, 5.98, 6.06, 6.12, 6.18, 6.24, 6.34, 6.4, 6.48, 6.56, 6.58, 6.62, 6.66, 6.68, 6.82, 6.84, 6.94, 7.02] - - expected_word_time_stamps_text = ['WHY', 'DOES', 'MILISANDRA', 'LOOK', 'LIKE', 'SHE', 'WANTS', 'TO', 'CONSUME', 'JOHN', 'SNOW', 'ON', 'THE', 'RIVT', 'AP', 'THE', 'WALL'] - expected_word_time_stamps_start = [1.42, 1.64, 2.26, 3.0, 3.28, 3.62, 3.8, 4.1, 4.28, 4.94, 5.28, 5.68, 5.8, 5.94, 6.32, 6.54, 6.66] - expected_word_time_stamps_end = [1.54, 1.9, 2.9, 3.16, 3.52, 3.72, 4.04, 4.18, 4.82, 5.16, 5.54, 5.72, 5.86, 6.18, 6.4, 6.62, 6.94] - # fmt: on - - output = tokenizer.batch_decode(pred_ids, output_char_offsets=True, output_word_offsets=True) - - char_offsets_text = self.get_from_offsets(output["char_offsets"][0], "char") - char_offsets_start = self.get_from_offsets(output["char_offsets"][0], "start_offset") - char_offsets_end = self.get_from_offsets(output["char_offsets"][0], "end_offset") - - word_offsets_text = self.get_from_offsets(output["word_offsets"][0], "word") - word_offsets_start = self.get_from_offsets(output["word_offsets"][0], "start_offset") - word_offsets_end = self.get_from_offsets(output["word_offsets"][0], "end_offset") - - # let's transform offsets to time stamps in seconds - char_time_stamps_start = [round(c * time_offset_wav2vec2_base, 2) for c in char_offsets_start] - char_time_stamps_end = [round(c * time_offset_wav2vec2_base, 2) for c in char_offsets_end] - - word_time_stamps_start = [round(w * time_offset_wav2vec2_base, 2) for w in word_offsets_start] - word_time_stamps_end = [round(w * time_offset_wav2vec2_base, 2) for w in word_offsets_end] - - # NOTE: you can verify the above results by checking out the dataset viewer - # on https://huggingface.co/datasets/common_voice/viewer/en/train and - # downloading / playing the sample `common_voice_en_100038.mp3`. 
As - # you can hear the time-stamps match more or less - - self.assertListEqual(expected_char_time_stamps_text, char_offsets_text) - self.assertListEqual(expected_char_time_stamps_start, char_time_stamps_start) - self.assertListEqual(expected_char_time_stamps_end, char_time_stamps_end) - - self.assertListEqual(expected_word_time_stamps_text, word_offsets_text) - self.assertListEqual(expected_word_time_stamps_start, word_time_stamps_start) - self.assertListEqual(expected_word_time_stamps_end, word_time_stamps_end) - - # overwrite from test_tokenization_common - def test_add_tokens_tokenizer(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - - # We usually have added tokens from the start in tests because our vocab fixtures are - # smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) - - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) - - new_toks_2 = { - "eos_token": AddedToken(">>>>|||<||<<|<<", lstrip=False, rstrip=False), - "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=False, lstrip=False), - } - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.encode( - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-3], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-3], tokens[-4]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-3], tokenizer.pad_token_id) - - @unittest.skip(reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") - def test_tf_encode_plus_sent_to_model(self): - pass - - @unittest.skip(reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode.") - def test_torch_encode_plus_sent_to_model(self): - pass - - def test_convert_tokens_to_string_format(self): - # The default common tokenizer tests assumes that the output of `convert_tokens_to_string` is a string which - # is not the case for Wav2vec2. 
- tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokens = ["T", "H", "I", "S", "|", "I", "S", "|", "A", "|", "T", "E", "X", "T"] - output = tokenizer.convert_tokens_to_string(tokens) - - self.assertIsInstance(output["text"], str) - - def test_nested_vocab(self): - eng_vocab = {"a": 7, "b": 8} - spa_vocab = {"a": 23, "c": 88} - ita_vocab = {"a": 6, "d": 9} - - nested_vocab = {"eng": eng_vocab, "spa": spa_vocab, "ita": ita_vocab} - - def check_tokenizer(tokenizer, check_ita_first=False): - if check_ita_first: - self.assertEqual(tokenizer.decode([6, 9, 9]), "ad") - self.assertEqual(tokenizer.encoder, ita_vocab) - tokenizer.set_target_lang("eng") - - self.assertEqual(tokenizer.encoder, eng_vocab) - self.assertEqual(tokenizer.decode([7, 8, 7]), "aba") - - tokenizer.set_target_lang("spa") - self.assertEqual(tokenizer.decode([23, 88, 23]), "aca") - self.assertEqual(tokenizer.encoder, spa_vocab) - - tokenizer.set_target_lang("eng") - self.assertEqual(tokenizer.encoder, eng_vocab) - self.assertEqual(tokenizer.decode([7, 7, 8]), "ab") - - tokenizer.set_target_lang("ita") - self.assertEqual(tokenizer.decode([6, 9, 9]), "ad") - self.assertEqual(tokenizer.encoder, ita_vocab) - - with tempfile.TemporaryDirectory() as tempdir: - tempfile_path = os.path.join(tempdir, "vocab.json") - with open(tempfile_path, "w") as temp_file: - json.dump(nested_vocab, temp_file) - - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(tempdir, target_lang="eng") - - check_tokenizer(tokenizer) - - with tempfile.TemporaryDirectory() as tempdir: - # should have saved target lang as "ita" since it was last one - tokenizer.save_pretrained(tempdir) - tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(tempdir) - - self.assertEqual(tokenizer.target_lang, "ita") - check_tokenizer(tokenizer, check_ita_first=True) \ No newline at end of file diff --git a/tests/transformers/models/wav2vec2_bert/__init__.py b/tests/transformers/models/wav2vec2_bert/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py deleted file mode 100644 index a8164e97f..000000000 --- a/tests/transformers/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ /dev/null @@ -1,899 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore Wav2Vec2-BERT model.""" -# pylint: disable=too-many-public-methods - -import tempfile -import unittest - -from datasets import load_dataset - -import numpy as np -from mindspore import Tensor -from mindnlp.transformers import Wav2Vec2BertConfig -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -#from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - AutoFeatureExtractor, - Wav2Vec2BertForAudioFrameClassification, - Wav2Vec2BertForCTC, - Wav2Vec2BertForSequenceClassification, - Wav2Vec2BertForXVector, - Wav2Vec2BertModel, - ) - from mindnlp.transformers.models.wav2vec2_bert.modeling_wav2vec2_bert import ( - _compute_mask_indices, - _sample_negative_indices, - ) - - -# Copied from tests.models.wav2vec2_conformer.test_modeling_wav2vec2_conformer.Wav2Vec2ConformerModelTester with Conformer->Bert, input_values->input_features -class Wav2Vec2BertModelTester: - # Ignore copy - def __init__( - self, - parent, - batch_size=13, - seq_length=200, # speech is longer - is_training=False, - hidden_size=16, - feature_projection_input_dim=16, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - do_stable_layer_norm=False, - num_adapter_layers=2, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - position_embeddings_type="relative", - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feature_projection_input_dim = feature_projection_input_dim - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - self.position_embeddings_type = position_embeddings_type - - self.output_seq_length = self.seq_length - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = self.output_seq_length - - for _ in range(num_adapter_layers): - self.adapter_output_seq_length = (self.adapter_output_seq_length - 1) // adapter_stride + 1 - - # Ignore copy - def prepare_config_and_inputs(self, position_embeddings_type="relative"): - input_shape = [self.batch_size, self.seq_length, 
self.feature_projection_input_dim] - - input_features = floats_tensor(input_shape, self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config(position_embeddings_type=position_embeddings_type) - - return config, input_features, attention_mask - - # Ignore copy - def get_config(self, position_embeddings_type="relative"): - return Wav2Vec2BertConfig( - hidden_size=self.hidden_size, - feature_projection_input_dim=self.feature_projection_input_dim, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - do_stable_layer_norm=self.do_stable_layer_norm, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - position_embeddings_type=position_embeddings_type, - ) - - def create_and_check_model(self, config, input_features, attention_mask): - model = Wav2Vec2BertModel(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_features, attention_mask): - config.add_adapter = True - model = Wav2Vec2BertModel(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_for_ctc(self, config, input_features, attention_mask): - config.add_adapter = True - config.output_hidden_size = 2 * config.hidden_size - model = Wav2Vec2BertForCTC(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) - ) - - # Ignore copy - def create_and_check_model_with_intermediate_ffn_before_adapter(self, config, input_features, attention_mask): - config.add_adapter = True - config.use_intermediate_ffn_before_adapter = True - model = Wav2Vec2BertModel(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - # also try with different adapter proj dim - config.output_hidden_size = 8 - model = Wav2Vec2BertModel(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_features, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = 
Wav2Vec2BertModel(config=config) - model.set_train(False) - result = model(input_features, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_model_float16(self, config, input_features, attention_mask): - model = Wav2Vec2BertModel(config=config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = Wav2Vec2BertModel.from_pretrained(tmpdirname) - - model.set_train(False) - - # TODO: change to float16 - result = model(input_features.type(dtype=mindspore.float32), attention_mask=attention_mask) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_features, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = Wav2Vec2BertModel(config=config) - model.set_train(False) - - input_features = input_features[:3] - attention_mask = ops.ones(input_features.shape, dtype=mindspore.bool_) - - input_lengths = [input_features.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_features, attention_mask=attention_mask).last_hidden_state - - for i in range(input_features.shape[0]): - input_slice = input_features[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(np.allclose(output, batch_output, atol=1e-3)) - - def check_ctc_loss(self, config, input_features, *args): - model = Wav2Vec2BertForCTC(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_features = input_features[:3] - # Ignore copy - attention_mask = ops.ones(input_features.shape[:2], dtype=mindspore.int64) - - input_lengths = [input_features.shape[1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_features.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_features, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_features, *args): - model = Wav2Vec2BertForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_features = input_features[:3] - # Ignore copy - attention_mask = ops.ones(input_features.shape[:2], dtype=mindspore.int64) - - input_lengths = [input_features.shape[1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_features.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = 
model(input_features, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_features, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_features, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2BertForCTC(config=config) - model.set_train(True) - - # Ignore copy - input_features = input_features[:3] - - input_lengths = [input_features.shape[1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_features.shape[0], max(max_length_labels) - 2), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_features, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_features, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2BertForSequenceClassification(config=config) - model.set_train(True) - - # freeze everything but the classification head - model.freeze_base_model() - - input_features = input_features[:3] - - # Ignore copy - input_lengths = [input_features.shape[1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_features.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - - loss = model(input_features, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_xvector_training(self, config, input_features, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2BertForXVector(config=config) - model.set_train(True) - - # freeze everything but the classification head - model.freeze_base_model() - - input_features = input_features[:3] - - input_lengths = [input_features.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_features.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_features[i, input_lengths[i] :] = 0.0 - - loss = model(input_features, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_labels_out_of_vocab(self, config, input_features, *args): - model = Wav2Vec2BertForCTC(config) - model.set_train(True) - - input_features = input_features[:3] - - input_lengths = [input_features.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_features.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_features, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_features, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_features": input_features, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -# Copied from tests.models.wav2vec2_conformer.test_modeling_wav2vec2_conformer.Wav2Vec2ConformerModelTest with 
Conformer->Bert, input_values->input_features -class Wav2Vec2BertModelTest(ModelTesterMixin, unittest.TestCase): - # Ignore copy - all_model_classes = ( - ( - Wav2Vec2BertForCTC, - Wav2Vec2BertModel, - Wav2Vec2BertForSequenceClassification, - Wav2Vec2BertForAudioFrameClassification, - Wav2Vec2BertForXVector, - ) - if is_mindspore_available() - else () - ) - - pipeline_model_mapping = ( - { - "audio-classification": Wav2Vec2BertForSequenceClassification, - "automatic-speech-recognition": Wav2Vec2BertForCTC, - "feature-extraction": Wav2Vec2BertModel, - } - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_headmasking = False - test_torchscript = False - - def setUp(self): - self.model_tester = Wav2Vec2BertModelTester(self) - self.config_tester = ConfigTester(self, config_class=Wav2Vec2BertConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_relative(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative") - self.model_tester.create_and_check_model(*config_and_inputs) - - # Ignore copy - def test_model_with_relative_key(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative_key") - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_rotary(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="rotary") - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_no_rel_pos(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type=None) - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_for_ctc(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_for_ctc(*config_and_inputs) - - # Ignore copy - def test_model_with_intermediate_ffn_before_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_intermediate_ffn_before_adapter(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - def test_model_float16_with_relative(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative") - self.model_tester.create_and_check_model_float16(*config_and_inputs) - - # Ignore copy - def test_model_float16_with_relative_key(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative_key") - self.model_tester.create_and_check_model_float16(*config_and_inputs) - - def test_model_float16_with_rotary(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="rotary") - self.model_tester.create_and_check_model_float16(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - 
self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - @unittest.skip("skip train") - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - @unittest.skip("skip train") - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - @unittest.skip("skip train") - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Ignore copy - @unittest.skip(reason="Wav2Vec2Bert has no inputs_embeds") - def test_inputs_embeds(self): - pass - - # Ignore copy - @unittest.skip(reason="`input_ids` is renamed to `input_features`") - def test_forward_signature(self): - pass - - # Ignore copy - @unittest.skip(reason="Wav2Vec2Bert has no tokens embeddings") - def test_resize_tokens_embeddings(self): - pass - - # Ignore copy - @unittest.skip(reason="Wav2Vec2Bert has no inputs_embeds") - def test_model_get_set_embeddings(self): - pass - - # Ignore copy - @unittest.skip(reason="non-robust architecture does not exist in Flax") - def test_equivalence_flax_to_pt(self): - pass - - # Ignore copy - @unittest.skip(reason="non-robust architecture does not exist in Flax") - def test_equivalence_pt_to_flax(self): - pass - - @unittest.skip("Mindspore has no retain_grad") - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_features = inputs_dict["input_features"] - - input_lengths = mindspore.tensor( - [input_features.shape[1] for _ in range(input_features.shape[0])], dtype=mindspore.int64 - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_features.shape[0], output_lengths[0] - 2), self.model_tester.vocab_size) - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - inputs_dict["labels"] = labels - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - attentions = outputs.attentions[0] - - hidden_states.retain_grad() - attentions.retain_grad() - - output.flatten()[0].backward(retain_graph=True) - - self.assertIsNotNone(hidden_states.grad) - self.assertIsNotNone(attentions.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - 
"quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "pos_bias_u") and module.pos_bias_u is not None: - module.pos_bias_u.data.fill_(3) - if hasattr(module, "pos_bias_v") and module.pos_bias_v is not None: - module.pos_bias_v.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - # Ignore copy - @unittest.skip(reason="Kept to make #Copied from working") - def test_mask_feature_prob_ctc(self): - pass - - # Ignore copy - @unittest.skip(reason="Kept to make #Copied from working") - def test_mask_time_prob_ctc(self): - pass - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - # Ignore copy - model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0") - self.assertIsNotNone(model) - - -@require_mindspore -# Copied from tests.models.wav2vec2_conformer.test_modeling_wav2vec2_conformer.Wav2Vec2ConformerUtilsTest with Conformer->Bert, input_values->input_features -class Wav2Vec2BertUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) - - num_masks = ops.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with 
probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor.from_numpy(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = Tensor.from_numpy(mask) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - # force one example to be heavily padded - attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - # make sure that non-padded examples cannot be padded - self.assertFalse(mask[0][attention_mask[0].to(mindspore.bool_).numpy()].any()) - - # Ignore copy - @unittest.skip(reason="Kept to make #Copied from working. 
Test a class used for pretraining, not yet supported.") - def test_compute_perplexity(self): - pass - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consists of same value - features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = Tensor.from_numpy(sampled_negative_indices) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - #self.assertEqual(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - # NOTE: which means [:, :, :, i] is equal for all i - self.assertEqual(negatives.shape[:-1], (num_negatives, batch_size, sequence_length)) - ref = negatives[:, :, :, 0] - for i in range(1, negatives.shape[-1]): - x = negatives[:, :, :, i] - self.assertTrue(ops.all(ref == x)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - mask[-1, sequence_length // 2 :] = 0 - - features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consists of same value - features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) - - # replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.asnumpy() - ) - sampled_negative_indices = ops.from_numpy(sampled_negative_indices) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertEqual(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - -@require_mindspore -@slow -class Wav2Vec2BertModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic
decoding with librispeech - speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) - speech_samples = speech_samples[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_inference_w2v2_bert(self): - model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0") - feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0") - - input_speech = self._load_datasamples(2) - - inputs = feature_extractor(input_speech, return_tensors="ms", padding=True) - - model.set_train(False) - outputs = model(**inputs, output_attentions=True) - - # fmt: off - expected_slice_0 = mindspore.tensor( - [[-0.0098, -0.0570, -0.1286, 0.0439, -0.1037, -0.0235], - [-0.0767, 0.0574, -0.3224, 0.0482, 0.0440, -0.0193], - [ 0.0220, -0.0878, -0.2027, -0.0028, -0.0666, 0.0721], - [ 0.0307, -0.1099, 0.0273, -0.0416, -0.0715, 0.0094], - [ 0.0758, -0.0291, 0.1084, 0.0004, -0.0751, -0.0116], - [ 0.0349, -0.0343, -0.0098, 0.0415, -0.0617, 0.0241], - [-0.0193, -0.0171, 0.1965, 0.0797, -0.0308, 0.2033], - [-0.0323, -0.0315, 0.0948, 0.0944, -0.0254, 0.1241], - [-0.0493, 0.0010, -0.1762, 0.0034, -0.0787, 0.0832], - [ 0.0043, -0.1228, -0.0739, 0.0266, -0.0337, -0.0068]] - ) - # fmt: on - - # fmt: off - expected_slice_1 = mindspore.tensor( - [[-0.0348, -0.0521, -0.3036, 0.0285, -0.0715, -0.0453], - [-0.0102, 0.0114, -0.3266, 0.0027, -0.0558, 0.0038], - [ 0.0454, 0.0148, -0.2418, -0.0392, -0.0455, 0.0478], - [-0.0013, 0.0825, -0.1730, -0.0091, -0.0426, 0.0360], - [-0.0227, 0.0687, -0.1168, 0.0569, -0.0160, 0.0759], - [-0.0318, 0.0562, -0.0508, 0.0605, 0.0150, 0.0953], - [-0.0415, 0.0438, 0.0233, 0.0336, 0.0262, 0.0860], - [-0.0163, 0.0048, 0.0807, 0.0119, 0.0712, 0.0158], - [ 0.0244, -0.0145, 0.0262, -0.0237, 0.0283, -0.0125], - [-0.0587, -0.0516, -0.0368, -0.0196, 0.0307, -0.1434]] - ) - # fmt: on - - self.assertTrue((outputs.last_hidden_state[0, 25:35, 4:10] - expected_slice_0).abs().max() <= 1e-4) - self.assertTrue((outputs.last_hidden_state[1, 25:35, 4:10] - expected_slice_1).abs().max() <= 1e-4) - - self.assertAlmostEqual(outputs.last_hidden_state[1].mean().item(), 3.3123e-05) - self.assertAlmostEqual(outputs.last_hidden_state[1].std().item(), 0.1545, delta=2e-5) - - self.assertListEqual(list(outputs.last_hidden_state.shape), [2, 326, 1024]) diff --git a/tests/transformers/models/wav2vec2_bert/test_processor_wav2vec2_bert.py b/tests/transformers/models/wav2vec2_bert/test_processor_wav2vec2_bert.py deleted file mode 100644 index a0fec4cf3..000000000 --- a/tests/transformers/models/wav2vec2_bert/test_processor_wav2vec2_bert.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
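The `test_compute_mask_indices_low_prob` cases above justify their 10-out-of-100 thresholds with a binomial argument ("P(100 coin flips, at most 9 heads) = 1.66e-18"). The quoted figure can be reproduced with a short standalone check; the snippet below is illustrative only, follows the fair-coin simplification stated in the test comment, and is not part of the deleted test files.

from math import comb

# Each of the 100 trials is treated as a fair coin flip (a span gets masked with
# probability ~0.5); the assertion only fails if one outcome occurs at most 9 times.
n_trials = 100
tail = sum(comb(n_trials, k) for k in range(10)) / 2**n_trials
print(f"{tail:.2e}")  # ~1.66e-18, matching the figure quoted in the test comment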
-"""Testing suite for the MindSpore Wav2Vec2-BERT processor.""" - -import json -import os -import shutil -import tempfile -import unittest - -from mindnlp.transformers.models.seamless_m4t import SeamlessM4TFeatureExtractor -from mindnlp.transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer -from mindnlp.transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES -from mindnlp.transformers.models.wav2vec2_bert import Wav2Vec2BertProcessor -from mindnlp.configs import FEATURE_EXTRACTOR_NAME - -from ..wav2vec2.test_feature_extraction_wav2vec2 import floats_list - - -# Copied from tests.models.wav2vec2.test_processor_wav2vec2.Wav2Vec2ProcessorTest with Wav2Vec2FeatureExtractor->SeamlessM4TFeatureExtractor, Wav2Vec2Processor->Wav2Vec2BertProcessor -class Wav2Vec2BertProcessorTest(unittest.TestCase): - def setUp(self): - vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - self.add_kwargs_tokens_map = { - "pad_token": "", - "unk_token": "", - "bos_token": "", - "eos_token": "", - } - feature_extractor_map = { - "feature_size": 1, - "padding_value": 0.0, - "sampling_rate": 16000, - "return_attention_mask": False, - "do_normalize": True, - } - - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - - with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(feature_extractor_map) + "\n") - - def get_tokenizer(self, **kwargs_init): - kwargs = self.add_kwargs_tokens_map.copy() - kwargs.update(kwargs_init) - return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_feature_extractor(self, **kwargs): - return SeamlessM4TFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_save_load_pretrained_default(self): - tokenizer = self.get_tokenizer() - feature_extractor = self.get_feature_extractor() - - processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - processor.save_pretrained(self.tmpdirname) - processor = Wav2Vec2BertProcessor.from_pretrained(self.tmpdirname) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) - self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) - self.assertIsInstance(processor.feature_extractor, SeamlessM4TFeatureExtractor) - - def test_save_load_pretrained_additional_features(self): - processor = Wav2Vec2BertProcessor( - tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor() - ) - processor.save_pretrained(self.tmpdirname) - - tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") - feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) - - processor = Wav2Vec2BertProcessor.from_pretrained( - self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 - ) - - self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) - self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) - - self.assertEqual(processor.feature_extractor.to_json_string(), 
feature_extractor_add_kwargs.to_json_string()) - self.assertIsInstance(processor.feature_extractor, SeamlessM4TFeatureExtractor) - - def test_feature_extractor(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - raw_speech = floats_list((3, 1000)) - - input_feat_extract = feature_extractor(raw_speech, return_tensors="np", return_attention_mask=True) - input_processor = processor(raw_speech, return_tensors="np", return_attention_mask=True) - - for key in input_feat_extract.keys(): - self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) - - def test_tokenizer(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - input_str = "This is a test string" - - encoded_processor = processor(text=input_str) - - encoded_tok = tokenizer(input_str) - - for key in encoded_tok.keys(): - self.assertListEqual(encoded_tok[key], encoded_processor[key]) - - def test_tokenizer_decode(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] - - decoded_processor = processor.batch_decode(predicted_ids) - decoded_tok = tokenizer.batch_decode(predicted_ids) - - self.assertListEqual(decoded_tok, decoded_processor) - - def test_model_input_names(self): - feature_extractor = self.get_feature_extractor() - tokenizer = self.get_tokenizer() - - processor = Wav2Vec2BertProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor) - - self.assertListEqual( - processor.model_input_names, - feature_extractor.model_input_names, - msg="`processor` and `feature_extractor` model input names do not match", - ) diff --git a/tests/transformers/models/wav2vec2_conformer/__init__.py b/tests/transformers/models/wav2vec2_conformer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py deleted file mode 100644 index a13f7d8cc..000000000 --- a/tests/transformers/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ /dev/null @@ -1,936 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the Pymindspore Wav2Vec2-Conformer model.""" - -import math -import tempfile -import unittest - -import mindspore -from mindspore import Tensor -import numpy as np -from datasets import load_dataset - -from mindnlp.transformers import Wav2Vec2ConformerConfig - -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - require_mindspore, - slow, -) - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - Wav2Vec2ConformerForAudioFrameClassification, - Wav2Vec2ConformerForCTC, - Wav2Vec2ConformerForPreTraining, - Wav2Vec2ConformerForSequenceClassification, - Wav2Vec2ConformerForXVector, - Wav2Vec2ConformerModel, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - ) - from mindnlp.transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import ( - Wav2Vec2ConformerGumbelVectorQuantizer, - _compute_mask_indices, - _sample_negative_indices, - ) - - -class Wav2Vec2ConformerModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - mask_time_prob=0.5, - mask_time_length=2, - vocab_size=32, - do_stable_layer_norm=False, - num_adapter_layers=1, - adapter_stride=2, - tdnn_dim=(32, 32), - tdnn_kernel=(5, 3), - tdnn_dilation=(1, 2), - xvector_output_dim=32, - position_embeddings_type="relative", - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.num_adapter_layers = num_adapter_layers - self.adapter_stride = adapter_stride - self.mask_time_prob = mask_time_prob - self.mask_time_length = mask_time_length - self.scope = scope - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - self.position_embeddings_type = position_embeddings_type - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length 
= int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1 - - def prepare_config_and_inputs(self, position_embeddings_type="relative"): - input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config(position_embeddings_type=position_embeddings_type) - - return config, input_values, attention_mask - - def get_config(self, position_embeddings_type="relative"): - return Wav2Vec2ConformerConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - mask_time_prob=self.mask_time_prob, - mask_time_length=self.mask_time_length, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - do_stable_layer_norm=self.do_stable_layer_norm, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - num_adapter_layers=self.num_adapter_layers, - adapter_stride=self.adapter_stride, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - position_embeddings_type=position_embeddings_type, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = Wav2Vec2ConformerModel(config=config) - model.set_train(False) - - result = model(input_values, attention_mask=attention_mask) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter(self, config, input_values, attention_mask): - config.add_adapter = True - model = Wav2Vec2ConformerModel(config=config) - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) - ) - - def create_and_check_model_with_adapter_for_ctc(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 2 * config.hidden_size - model = Wav2Vec2ConformerForCTC(config=config) - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) - ) - - def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): - config.add_adapter = True - config.output_hidden_size = 8 - model = Wav2Vec2ConformerModel(config=config) - model.set_train(False) - result = model(input_values, attention_mask=attention_mask) - self.parent.assertEqual( - result.last_hidden_state.shape, - (self.batch_size, self.adapter_output_seq_length, config.output_hidden_size), - ) - - def create_and_check_model_float16(self, config, input_values, attention_mask): - model = Wav2Vec2ConformerModel(config=config) - - with 
tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = Wav2Vec2ConformerModel.from_pretrained(tmpdirname) - - model.set_train(False) - - result = model(input_values.type(dtype=mindspore.float32), attention_mask=attention_mask) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pymindspore/fairseq/issues/3227 - model = Wav2Vec2ConformerModel(config=config) - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(np.allclose(output.numpy(), batch_output.numpy(), atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = Wav2Vec2ConformerForCTC(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], min(max_length_labels).item() - 1), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = Wav2Vec2ConformerForSequenceClassification(config=config) - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor([input_values.shape[0], 1], len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - #copied from wav2vec2 - @unittest.skip('ignore train temporarily') - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - 
model = Wav2Vec2ConformerForCTC(config=config) - model.set_train(True) - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(Tensor(input_lengths)) - labels = ids_tensor([input_values.shape[0], max(max_length_labels) - 2], model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - loss.backward() - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2ConformerForSequenceClassification(config=config) - model.set_train(True) - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor([input_values.shape[0], 1], len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # loss.backward() - - def check_xvector_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = Wav2Vec2ConformerForXVector(config=config) - model.set_train(True) - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor([input_values.shape[0], 1], len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - # loss.backward() - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = Wav2Vec2ConformerForCTC(config) - - model.set_train(True) - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], max(max_length_labels).item() - 2), model.config.vocab_size + 100) - - with self.parent.assertRaises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class Wav2Vec2ConformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - Wav2Vec2ConformerForCTC, - Wav2Vec2ConformerModel, - Wav2Vec2ConformerForSequenceClassification, - Wav2Vec2ConformerForPreTraining, - Wav2Vec2ConformerForAudioFrameClassification, - Wav2Vec2ConformerForXVector, - ) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": Wav2Vec2ConformerForSequenceClassification, - "automatic-speech-recognition": Wav2Vec2ConformerForCTC, - "feature-extraction": 
Wav2Vec2ConformerModel, - } - if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - test_mindsporescript = False - - def setUp(self): - self.model_tester = Wav2Vec2ConformerModelTester(self) - self.config_tester = ConfigTester(self, config_class=Wav2Vec2ConformerConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_relative(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative") - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_rotary(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="rotary") - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_no_rel_pos(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type=None) - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_with_adapter(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) - - def test_model_with_adapter_for_ctc(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_for_ctc(*config_and_inputs) - - def test_model_with_adapter_proj_dim(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs) - - @require_mindspore - def test_model_float16_with_relative(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="relative") - self.model_tester.create_and_check_model_float16(*config_and_inputs) - - @require_mindspore - def test_model_float16_with_rotary(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs(position_embeddings_type="rotary") - self.model_tester.create_and_check_model_float16(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_xvector_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_xvector_training(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # Wav2Vec2Conformer has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # Wav2Vec2Conformer cannot resize token embeddings - # since it has no tokens embeddings - def 
test_resize_tokens_embeddings(self): - pass - - # Wav2Vec2Conformer has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_get_set_embeddings(self): - pass - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "pos_bias_v", - "pos_bias_u", - "pointwise_conv1", - "pointwise_conv2", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.data.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - if hasattr(module, "pos_bias_u") and module.pos_bias_u is not None: - module.pos_bias_u.data.fill_(3) - if hasattr(module, "pos_bias_v") and module.pos_bias_v is not None: - module.pos_bias_v.data.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.data.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.data.fill_(3) - - def test_mask_feature_prob_ctc(self): - model = Wav2Vec2ConformerForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2-conformer", mask_feature_prob=0.2, mask_feature_length=2 - ) - model.set_train(True) - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2-conformer", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - def test_mask_time_prob_ctc(self): - model = Wav2Vec2ConformerForCTC.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2-conformer", mask_time_prob=0.2, mask_time_length=2 - ) - model.set_train(True) - processor = Wav2Vec2Processor.from_pretrained( - "hf-internal-testing/tiny-random-wav2vec2-conformer", return_attention_mask=True - ) - - batch_duration_in_seconds = [1, 3, 2, 6] - input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds] - - batch = processor( - input_features, 
padding=True, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="ms" - ) - - logits = model( - input_values=batch["input_values"], - attention_mask=batch["attention_mask"], - ).logits - - self.assertEqual(logits.shape, (4, 1498, 32)) - - @unittest.skip(reason="Feed forward chunking is not implemented") - def test_feed_forward_chunking(self): - pass - - @slow - @unittest.skip('memory exceeded') - def test_model_from_pretrained(self): - model = Wav2Vec2ConformerModel.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - self.assertIsNotNone(model) - -def check_unique_values(tensor, dim): - # get the maximum and minimum values along the given dimension - max_val = ops.reduce_max(tensor, dim) - min_val = ops.reduce_min(tensor, dim) - # check whether the maximum and minimum values are equal - unique_values = ops.equal(max_val, min_val) - return unique_values - -@require_mindspore -class Wav2Vec2ConformerUtilsTest(unittest.TestCase): - def test_compute_mask_indices(self): - batch_size = 4 - sequence_length = 60 - mask_prob = 0.5 - mask_length = 1 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor(mask) - - self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) - - def test_compute_mask_indices_low_prob(self): - # with these settings num_masked_spans=0.5, which means probabilistic rounding - # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in - # the other 5 out of 10, cases num_masked_spans=1 - n_trials = 100 - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - count_dimensions_masked = 0 - count_dimensions_not_masked = 0 - - for _ in range(n_trials): - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor(mask) - - num_masks = ops.sum(mask).item() - - if num_masks > 0: - count_dimensions_masked += 1 - else: - count_dimensions_not_masked += 1 - - # as we test for at least 10 masked dimension and at least - # 10 non-masked dimension, this test could fail with probability: - # P(100 coin flips, at most 9 heads) = 1.66e-18 - self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) - self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) - - def test_compute_mask_indices_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) - mask = Tensor(mask) - - # because of overlap mask don't have to add up exactly to `mask_prob * sequence_length`, but have to be smaller or equal - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - def test_compute_mask_indices_attn_mask_overlap(self): - batch_size = 4 - sequence_length = 80 - mask_prob = 0.5 - mask_length = 4 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - attention_mask[:2, sequence_length // 2 :] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask - ) - mask = Tensor(mask) - - for batch_sum in mask.sum(axis=-1): - self.assertTrue(int(batch_sum) <= mask_prob * sequence_length) - - self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) - - def test_compute_mask_indices_short_audio(self): - batch_size = 4 - sequence_length = 100 - mask_prob = 0.05 - mask_length = 10 - - attention_mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - # force one example to be heavily padded -
attention_mask[0, 5:] = 0 - - mask = _compute_mask_indices( - (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 - ) - - mask_bool = Tensor(mask[0], mindspore.bool_) - attention_mask_bool = attention_mask[0].to(mindspore.bool_) - # make sure that non-padded examples cannot be padded - self.assertFalse(mask_bool[attention_mask_bool].any()) - - def test_compute_perplexity(self): - probs = ops.arange(100, dtype=mindspore.float32).reshape(2, 5, 10) / 100 - - ppl = Wav2Vec2ConformerGumbelVectorQuantizer._compute_perplexity(probs) - self.assertTrue(abs(ppl.item() - 141.4291) < 1e-3) - - # mask half of the input - mask = ops.ones((2,), dtype=mindspore.bool_) - mask[0] = 0 - - ppl = Wav2Vec2ConformerGumbelVectorQuantizer._compute_perplexity(probs, mask) - self.assertTrue(abs(ppl.item() - 58.6757) < 1e-3) - - def test_sample_negatives(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices((batch_size, sequence_length), num_negatives, None) - sampled_negative_indices = Tensor(sampled_negative_indices) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - negatives_np = negatives.asnumpy() - unique_negatives = np.unique(negatives_np, axis=-1) - unique_negatives_tensor = Tensor(unique_negatives, mindspore.float32) - - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(unique_negatives_tensor.shape, (num_negatives, batch_size, sequence_length, 1)) - - def test_sample_negatives_with_mask(self): - batch_size = 2 - sequence_length = 10 - hidden_size = 4 - num_negatives = 3 - - # second half of last input tensor is padded - mask = ops.ones((batch_size, sequence_length), dtype=mindspore.int64) - mask[-1, sequence_length // 2 :] = 0 - - features = (ops.arange(sequence_length * hidden_size) // hidden_size).view( - sequence_length, hidden_size - ) # each value in vector consits of same value - features = features[None, :].broadcast_to((batch_size, sequence_length, hidden_size)) - - # replace masked feature vectors with -100 to test that those are not sampled - features = ops.where(mask[:, :, None].broadcast_to(features.shape).bool(), features, -100) - - # sample negative indices - sampled_negative_indices = _sample_negative_indices( - (batch_size, sequence_length), num_negatives, mask.asnumpy() - ) - sampled_negative_indices = ops.from_numpy(sampled_negative_indices) - negatives = features.view(-1, hidden_size)[sampled_negative_indices.long().view(-1)] - negatives = negatives.view(batch_size, sequence_length, -1, hidden_size).permute(2, 0, 1, 3) - - self.assertTrue((negatives >= 0).all().item()) - - self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size)) - - # make sure no 
negatively sampled vector is actually a positive one - for negative in negatives: - self.assertTrue(((negative - features) == 0).sum() == 0.0) - - # make sure that full vectors are sampled and not values of vectors => this means that `unique()` yields a single value for `hidden_size` dim - self.assertTrue(ops.unique(negatives, dim=-1).shape, (num_negatives, batch_size, sequence_length, 1)) - -@require_mindspore -@slow -class Wav2Vec2ConformerModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) - speech_samples = speech_samples[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def test_inference_ctc_normal_batched_rel_pos(self): - model = Wav2Vec2ConformerForCTC.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large-960h-ft") - - processor = Wav2Vec2Processor.from_pretrained( - "facebook/wav2vec2-conformer-rel-pos-large-960h-ft", do_lower_case=True - ) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loincloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_ctc_normal_batched_rope(self): - model = Wav2Vec2ConformerForCTC.from_pretrained("facebook/wav2vec2-conformer-rope-large-960h-ft") - - processor = Wav2Vec2Processor.from_pretrained( - "facebook/wav2vec2-conformer-rope-large-960h-ft", do_lower_case=True - ) - - input_speech = self._load_datasamples(2) - - inputs = processor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - - logits = model(input_values).logits - - predicted_ids = ops.argmax(logits, dim=-1) - predicted_trans = processor.batch_decode(predicted_ids) - - EXPECTED_TRANSCRIPTIONS = [ - "a man said to the universe sir i exist", - "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore", - ] - self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) - - def test_inference_pretrained(self): - model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - "facebook/wav2vec2-conformer-rel-pos-large", return_attention_mask=True - ) - input_speech = self._load_datasamples(2) - - inputs_dict = feature_extractor(input_speech, return_tensors="ms", padding=True) - - batch_size = inputs_dict["input_values"].shape[0] - input_lengths = inputs_dict["input_values"].shape[1] - feature_seq_length = int(model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1])) - - features_shape = (batch_size, feature_seq_length) - - mask_time_indices = _compute_mask_indices( - features_shape, - model.config.mask_time_prob, - model.config.mask_time_length, - min_masks=2, - ) - mask_time_indices = Tensor.from_numpy(mask_time_indices) - - outputs = model( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - 
mask_time_indices=mask_time_indices, - ) - - # compute cosine similarity - cosine_sim = ops.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) - - # retrieve cosine sim of masked features - cosine_sim_masked = cosine_sim[mask_time_indices] - - # ... now compare to randomly initialized model - - config = Wav2Vec2ConformerConfig.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - model_rand = Wav2Vec2ConformerForPreTraining(config).set_train(False) - - outputs_rand = model_rand( - inputs_dict.input_values, - attention_mask=inputs_dict.attention_mask, - mask_time_indices=mask_time_indices, - ) - - # compute cosine similarity - cosine_sim_rand = ops.cosine_similarity( - outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1 - ) - - # retrieve cosine sim of masked features - cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices] - - # a pretrained wav2vec2_conformer model has learned to predict the quantized latent states - # => the cosine similarity between quantized states and predicted states > 0.5 - # a random wav2vec2_conformer model has not learned to predict the quantized latent states - # => the cosine similarity between quantized states and predicted states is very likely < 0.1 - self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0) diff --git a/tests/transformers/models/wavlm/__init__.py b/tests/transformers/models/wavlm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/wavlm/test_modeling_wavlm.py b/tests/transformers/models/wavlm/test_modeling_wavlm.py deleted file mode 100644 index 3e3beeda2..000000000 --- a/tests/transformers/models/wavlm/test_modeling_wavlm.py +++ /dev/null @@ -1,614 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
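The model testers in the Wav2Vec2-Conformer file above and the WavLM file below both predict the encoder sequence length from the convolutional feature extractor with the same recurrence, length -> (length - (kernel - 1)) / stride per conv layer, rounded up once at the end. The standalone sketch below makes that arithmetic concrete using the testers' default settings (seq_length=1024, conv_kernel=(8, 8, 8), conv_stride=(4, 4, 4)); it is illustrative only and not part of the deleted files.

import math

# Recurrence the testers use to derive output_seq_length / encoder_seq_length.
seq_length = 1024
conv_kernel = (8, 8, 8)
conv_stride = (4, 4, 4)

output_seq_length = seq_length
for kernel, stride in zip(conv_kernel, conv_stride):
    output_seq_length = (output_seq_length - (kernel - 1)) / stride

print(int(math.ceil(output_seq_length)))  # 14 frames for a 1024-sample input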
-"""Testing suite for the PyTorch WavLM model.""" - -import math -import unittest - -import pytest -from datasets import load_dataset -import numpy as np - -from mindnlp.transformers import WavLMConfig -from mindnlp.utils.testing_utils import require_mindspore, slow, is_mindspore_available -import mindnlp - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - _config_zero_init, - floats_tensor, - ids_tensor, - random_attention_mask, -) -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - Wav2Vec2FeatureExtractor, - WavLMForAudioFrameClassification, - WavLMForCTC, - WavLMForSequenceClassification, - WavLMForXVector, - WavLMModel, - ) - - -class WavLMModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=1024, # speech is longer - is_training=False, - hidden_size=16, - feat_extract_norm="group", - feat_extract_dropout=0.0, - feat_extract_activation="gelu", - conv_dim=(32, 32, 32), - conv_stride=(4, 4, 4), - conv_kernel=(8, 8, 8), - conv_bias=False, - num_conv_pos_embeddings=16, - num_conv_pos_embedding_groups=2, - num_hidden_layers=2, - num_attention_heads=2, - hidden_dropout_prob=0.1, # this is most likely not correctly set yet - intermediate_size=20, - layer_norm_eps=1e-5, - hidden_act="gelu", - initializer_range=0.02, - vocab_size=32, - do_stable_layer_norm=False, - tdnn_dim=(32, 32), - tdnn_kernel=(3, 3), - tdnn_dilation=(1, 1), - xvector_output_dim=32, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.hidden_size = hidden_size - self.feat_extract_norm = feat_extract_norm - self.feat_extract_dropout = feat_extract_dropout - self.feat_extract_activation = feat_extract_activation - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_kernel = conv_kernel - self.conv_bias = conv_bias - self.num_conv_pos_embeddings = num_conv_pos_embeddings - self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.intermediate_size = intermediate_size - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.vocab_size = vocab_size - self.do_stable_layer_norm = do_stable_layer_norm - self.tdnn_dim = tdnn_dim - self.tdnn_kernel = tdnn_kernel - self.tdnn_dilation = tdnn_dilation - self.xvector_output_dim = xvector_output_dim - self.scope = scope - - output_seq_length = self.seq_length - for kernel, stride in zip(self.conv_kernel, self.conv_stride): - output_seq_length = (output_seq_length - (kernel - 1)) / stride - self.output_seq_length = int(math.ceil(output_seq_length)) - self.encoder_seq_length = self.output_seq_length - - def prepare_config_and_inputs(self): - input_values = floats_tensor([self.batch_size, self.seq_length], scale=1.0) - attention_mask = random_attention_mask([self.batch_size, self.seq_length]) - - config = self.get_config() - - return config, input_values, attention_mask - - def get_config(self): - return WavLMConfig( - hidden_size=self.hidden_size, - feat_extract_norm=self.feat_extract_norm, - feat_extract_dropout=self.feat_extract_dropout, - feat_extract_activation=self.feat_extract_activation, - conv_dim=self.conv_dim, - 
conv_stride=self.conv_stride, - conv_kernel=self.conv_kernel, - conv_bias=self.conv_bias, - num_conv_pos_embeddings=self.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - hidden_dropout_prob=self.hidden_dropout_prob, - intermediate_size=self.intermediate_size, - layer_norm_eps=self.layer_norm_eps, - hidden_act=self.hidden_act, - initializer_range=self.initializer_range, - vocab_size=self.vocab_size, - tdnn_dim=self.tdnn_dim, - tdnn_kernel=self.tdnn_kernel, - tdnn_dilation=self.tdnn_dilation, - xvector_output_dim=self.xvector_output_dim, - ) - - def create_and_check_model(self, config, input_values, attention_mask): - model = WavLMModel(config=config) - - model.set_train(False) - - result = model(input_values, attention_mask=attention_mask) - - - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size) - ) - - def create_and_check_batch_inference(self, config, input_values, *args): - # test does not pass for models making use of `group_norm` - # check: https://github.com/pytorch/fairseq/issues/3227 - model = WavLMModel(config=config) - - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.bool_) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0.0 - - batch_outputs = model(input_values, attention_mask=attention_mask).last_hidden_state - - for i in range(input_values.shape[0]): - input_slice = input_values[i : i + 1, : input_lengths[i]] - output = model(input_slice).last_hidden_state - - batch_output = batch_outputs[i : i + 1, : output.shape[1]] - self.parent.assertTrue(np.allclose(output.asnumpy(), batch_output.asnumpy(), atol=1e-3)) - - def check_ctc_loss(self, config, input_values, *args): - model = WavLMForCTC(config=config) - - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.Tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], (min(max_length_labels) - 1).item()), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - model.config.ctc_loss_reduction = "sum" - sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - model.config.ctc_loss_reduction = "mean" - mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(sum_loss, float)) - self.parent.assertTrue(isinstance(mean_loss, float)) - - def check_seq_classifier_loss(self, config, input_values, *args): - model = WavLMForSequenceClassification(config=config) - - - # make sure that dropout is disabled - model.set_train(False) - - input_values = input_values[:3] - attention_mask = ops.ones(input_values.shape, dtype=mindspore.int64) - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in 
range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - attention_mask[i, input_lengths[i] :] = 0 - - masked_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item() - unmasked_loss = model(input_values, labels=labels).loss.item() - - self.parent.assertTrue(isinstance(masked_loss, float)) - self.parent.assertTrue(isinstance(unmasked_loss, float)) - self.parent.assertTrue(masked_loss != unmasked_loss) - - def check_ctc_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = WavLMForCTC(config=config) - - model.set_train(True) - - # freeze feature encoder - model.freeze_feature_encoder() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.Tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], (max(max_length_labels) - 2).item()), model.config.vocab_size) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - if max_length_labels[i] < labels.shape[-1]: - # it's important that we make sure that target lengths are at least - # one shorter than logit lengths to prevent -inf - labels[i, max_length_labels[i] - 1 :] = -100 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - - def check_seq_classifier_training(self, config, input_values, *args): - config.ctc_zero_infinity = True - model = WavLMForSequenceClassification(config=config) - - model.set_train(True) - - # freeze everything but the classification head - model.freeze_base_model() - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - labels = ids_tensor((input_values.shape[0], 1), len(model.config.id2label)) - - # pad input - for i in range(len(input_lengths)): - input_values[i, input_lengths[i] :] = 0.0 - - loss = model(input_values, labels=labels).loss - self.parent.assertFalse(ops.isinf(loss).item()) - - - def check_output_attentions(self, config, input_values, attention_mask): - model = WavLMModel(config=config) - model.config.layerdrop = 1.0 - - model.set_train(True) - - outputs = model(input_values, attention_mask=attention_mask, output_attentions=True) - self.parent.assertTrue(len(outputs.attentions) > 0) - - def check_labels_out_of_vocab(self, config, input_values, *args): - model = WavLMForCTC(config) - - model.set_train(True) - - input_values = input_values[:3] - - input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]] - max_length_labels = model._get_feat_extract_output_lengths(mindspore.Tensor(input_lengths)) - labels = ids_tensor((input_values.shape[0], (max(max_length_labels) - 2).item()), model.config.vocab_size + 100) - - with pytest.raises(ValueError): - model(input_values, labels=labels) - - def prepare_config_and_inputs_for_common(self): - config, input_values, attention_mask = self.prepare_config_and_inputs() - inputs_dict = {"input_values": input_values, "attention_mask": attention_mask} - return config, inputs_dict - - -@require_mindspore -class WavLMModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - (WavLMForCTC, WavLMModel, WavLMForAudioFrameClassification, WavLMForSequenceClassification, WavLMForXVector) - if is_mindspore_available() - else () - ) - pipeline_model_mapping = ( - { - "audio-classification": WavLMForSequenceClassification, - "automatic-speech-recognition": WavLMForCTC, - "feature-extraction": WavLMModel, - } - 
if is_mindspore_available() - else {} - ) - test_pruning = False - test_headmasking = False - - def setUp(self): - self.model_tester = WavLMModelTester(self) - self.config_tester = ConfigTester(self, config_class=WavLMConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_ctc_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_loss(*config_and_inputs) - - def test_seq_classifier_loss_inference(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_loss(*config_and_inputs) - - def test_ctc_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_ctc_training(*config_and_inputs) - - def test_seq_classifier_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_seq_classifier_training(*config_and_inputs) - - def test_output_attentions(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_output_attentions(*config_and_inputs) - - def test_labels_out_of_vocab(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - - # WavLM has no inputs_embeds - def test_inputs_embeds(self): - pass - - # `input_ids` is renamed to `input_values` - def test_forward_signature(self): - pass - - # WavLM cannot resize token embeddings - # since it has no tokens embeddings - def test_resize_tokens_embeddings(self): - pass - - # WavLM has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented - def test_model_get_set_embeddings(self): - pass - - # WavLM uses PyTorch's multi-head-attention class - # and thus can't retain gradients on attentions - def test_retain_grad_hidden_states_attentions(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.output_hidden_states = True - config.output_attentions = True - - # no need to test all models as different heads yield the same functionality - model_class = self.all_model_classes[0] - model = model_class(config) - - - # set layer drop to 0 - model.config.layerdrop = 0.0 - - input_values = inputs_dict["input_values"] - - input_lengths = mindspore.Tensor( - [input_values.shape[1] for _ in range(input_values.shape[0])], dtype=mindspore.int64 - ) - output_lengths = model._get_feat_extract_output_lengths(input_lengths) - - labels = ids_tensor((input_values.shape[0], (output_lengths[0] - 2).item()), self.model_tester.vocab_size) - try: - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - except: - inputs_dict["attention_mask"] = ops.ones_like(inputs_dict["attention_mask"]) - - inputs_dict["labels"] = labels - - - outputs = model(**inputs_dict) - - output = outputs[0] - - # Encoder-/Decoder-only models - hidden_states = outputs.hidden_states[0] - # hidden_states.retain_grad() - # self.assertIsNotNone(hidden_states.grad) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - 
uniform_init_parms = [ - "conv.weight", - "conv.parametrizations.weight", - "masked_spec_embed", - "codevectors", - "quantizer.weight_proj.weight", - "project_hid.weight", - "project_hid.bias", - "project_q.weight", - "project_q.bias", - "feature_projection.projection.weight", - "feature_projection.projection.bias", - "label_embeddings_concat", - "rel_attn_embed", - "objective.weight", - ] - if param.requires_grad: - if any(x in name for x in uniform_init_parms): - self.assertTrue( - -1.0 <= ((param.mean() * 1e9).round() / 1e9).item() <= 1.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.fill_(3) - if hasattr(module, "weight_g") and module.weight_g is not None: - module.weight_g.fill_(3) - if hasattr(module, "weight_v") and module.weight_v is not None: - module.weight_v.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.fill_(3) - if hasattr(module, "codevectors") and module.codevectors is not None: - module.codevectors.fill_(3) - if hasattr(module, "masked_spec_embed") and module.masked_spec_embed is not None: - module.masked_spec_embed.fill_(3) - - @unittest.skip(reason="Feed forward chunking is not implemented for WavLM") - def test_feed_forward_chunking(self): - pass - - @slow - def test_model_from_pretrained(self): - model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus", from_pt=True) - self.assertIsNotNone(model) - - -@require_mindspore -# @require_mindsporeaudio -@slow -class WavLMModelIntegrationTest(unittest.TestCase): - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").filter( - lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] - )[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") - - return ds[:num_samples] - - def test_inference_base(self): - model = WavLMModel.from_pretrained("microsoft/wavlm-base-plus", from_pt=True) - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - "microsoft/wavlm-base-plus", return_attention_mask=True, from_pt=True - ) - - input_speech = self._load_datasamples(2) - - inputs = feature_extractor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - with mindspore._no_grad(): - hidden_states_slice = ( - model(input_values, attention_mask=attention_mask).last_hidden_state[:, -2:, -2:] - ) - - EXPECTED_HIDDEN_STATES_SLICE = mindspore.Tensor( - [[[0.0577, 0.1161], [0.0579, 0.1165]], [[0.0199, 0.1237], [0.0059, 0.0605]]] - ) - - self.assertTrue(np.allclose(hidden_states_slice.asnumpy(), EXPECTED_HIDDEN_STATES_SLICE.asnumpy(), atol=5e-2)) - - def test_inference_large(self): - model = WavLMModel.from_pretrained("microsoft/wavlm-large", from_pt=True) - feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - "microsoft/wavlm-large", return_attention_mask=True, from_pt=True - ) - - input_speech = self._load_datasamples(2) - - inputs = 
feature_extractor(input_speech, return_tensors="ms", padding=True) - - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - with mindspore._no_grad(): - hidden_states_slice = ( - model(input_values, attention_mask=attention_mask).last_hidden_state[:, -2:, -2:] - ) - - EXPECTED_HIDDEN_STATES_SLICE = mindspore.Tensor( - [[[0.2122, 0.0500], [0.2118, 0.0563]], [[0.1353, 0.1818], [0.2453, 0.0595]]] - ) - - self.assertTrue(np.allclose(hidden_states_slice.asnumpy(), EXPECTED_HIDDEN_STATES_SLICE.asnumpy(), rtol=5e-2)) - - def test_inference_diarization(self): - model = WavLMForAudioFrameClassification.from_pretrained("microsoft/wavlm-base-plus-sd", from_pt=True) - processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sd", from_pt=True) - input_data = self._load_superb("sd", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True, sampling_rate=16_000) - input_values = inputs.input_values - attention_mask = inputs.attention_mask - - with mindspore._no_grad(): - outputs = model(input_values, attention_mask=attention_mask) - # labels is a one-hot array of shape (num_frames, num_speakers) - labels = (outputs.logits > 0).long() - - # s3prl logits for the same batch - expected_logits = mindspore.Tensor( - [ - [[-5.9566, -8.6554], [-5.7137, -8.9386], [-5.7906, -7.0973], [-5.7829, -5.9999]], - [[-5.2086, -7.7878], [-4.8890, -7.9312], [-4.2004, -3.9101], [-5.4480, -4.6932]], - [[-4.6105, -6.7178], [-5.1930, -6.1635], [-2.6228, -4.1123], [-2.7646, -3.1576]], - [[-4.4477, -7.9206], [-3.9339, -7.3707], [-4.9528, -4.8242], [-3.6921, -2.9687]], - ], - ) - - - self.assertEqual(labels[0, :, 0].sum(), 258) - self.assertEqual(labels[0, :, 1].sum(), 647) - self.assertTrue(np.allclose(outputs.logits[:, :4].asnumpy(), expected_logits.asnumpy(), atol=1e-2)) - - def test_inference_speaker_verification(self): - model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv", from_pt=True) - processor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv", from_pt=True) - input_data = self._load_superb("si", 4) - inputs = processor(input_data["speech"], return_tensors="ms", padding=True) - labels = mindspore.Tensor([5, 1, 1, 3]).T - - with mindspore._no_grad(): - input_values = inputs.input_values - attention_mask = inputs.attention_mask - outputs = model(input_values, attention_mask=attention_mask, labels=labels) - embeddings = mindnlp.modules.functional.normalize(outputs.embeddings, dim=-1) - - # cosine_sim = mindspore.nn.CosineSimilarity(dim=-1) - # id10002 vs id10002 - self.assertAlmostEqual(ops.cosine_similarity(embeddings[1], embeddings[2], dim=-1).item(), 0.9787, 3) - # id10006 vs id10002 - self.assertAlmostEqual(ops.cosine_similarity(embeddings[0], embeddings[1], dim=-1).item(), 0.5064, 3) - # id10002 vs id10004 - self.assertAlmostEqual(ops.cosine_similarity(embeddings[2], embeddings[3], dim=-1).item(), 0.4780, 3) - - self.assertAlmostEqual(outputs.loss.item(), 18.4154, 2) diff --git a/tests/transformers/models/whisper/__init__.py b/tests/transformers/models/whisper/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/whisper/test_modeling_whisper.py b/tests/transformers/models/whisper/test_modeling_whisper.py deleted file mode 100644 index e6e5ce9dc..000000000 --- a/tests/transformers/models/whisper/test_modeling_whisper.py +++ /dev/null @@ -1,3226 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Testing suite for the MindSpore Whisper model.""" - -import copy -import inspect -import os -import random -import re -import tempfile -import time -import unittest - -import numpy as np -from parameterized import parameterized - -import mindnlp -from mindnlp.transformers import WhisperConfig -from mindnlp.utils.testing_utils import ( - is_flaky, - require_mindspore, - require_mindspore_gpu, - slow, -) -from mindnlp.utils import cached_property, is_mindspore_available -from mindnlp.utils.import_utils import is_datasets_available -from huggingface_hub import hf_hub_download - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_datasets_available(): - import datasets - from datasets import Audio, load_dataset - -if is_mindspore_available(): - import mindspore - from mindspore.dataset.audio import Resample - from mindnlp.core import nn, ops, no_grad - from mindnlp.data.io.audio import read - from mindnlp.engine import set_seed - - from mindnlp.transformers import ( - WhisperFeatureExtractor, - WhisperForAudioClassification, - WhisperForCausalLM, - WhisperForConditionalGeneration, - WhisperModel, - WhisperProcessor, - ) - from mindnlp.transformers.generation import ( - GenerateEncoderDecoderOutput, - ) - from mindnlp.transformers.generation.logits_process import LogitsProcessor - from mindnlp.transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder, sinusoids - - class DummyTimestampLogitProcessor(LogitsProcessor): - """This processor fakes the correct timestamps tokens pattern [TOK_1] [TOK_2] ... 
[TOK_N] [TIME_STAMP_TOK_1] [TIME_STAMP_TOK_2] [TOK_N+1] ...""" - - def __init__( - self, timestamp_begin, vocab_size, batch_size, max_length, min_space=3, seed=0, is_length_ascending=True - ): - self.timestamp_begin = timestamp_begin - self.vocab_size = vocab_size - - self.min_space_between_timestamps = min_space - self.timestamp_tokens = ops.arange(self.timestamp_begin, self.vocab_size) - self.is_length_ascending = is_length_ascending - - self.no_time_stamp_counter = batch_size * [0] - self.prev_highest_timestamp = batch_size * [0] - self.batch_size = batch_size - self.max_length = max_length - self.count = 0 - self.begin_index = 0 - - self.let_pass = [[] for _ in range(batch_size)] - for k in range(batch_size): - random.seed(seed + k) - for _ in range(10000): - self.let_pass[k].append(random.randint(1, 10) <= 3) - - def set_begin_index(self, begin_index: int): - self.begin_index = begin_index - - def __call__(self, input_ids: mindspore.Tensor, scores: mindspore.Tensor) -> mindspore.Tensor: - # we don't want to randomely sample timestamp tokens - if input_ids.shape[-1] != self.begin_index: - scores[:, self.timestamp_begin :] = float(ops.finfo(scores.dtype).min) - - self.no_time_stamp_counter = [x + 1 for x in self.no_time_stamp_counter] - for k in range(input_ids.shape[0]): - # make sure to use correct index if a batch was removed - if self.is_length_ascending and input_ids.shape[0] < self.batch_size: - prev_k = k + self.batch_size - input_ids.shape[0] - else: - prev_k = k - - if input_ids[k, -1] == self.timestamp_begin: - self.no_time_stamp_counter[prev_k] = 0 - - can_produce = self.no_time_stamp_counter[prev_k] > self.min_space_between_timestamps - must_produce = ( - input_ids[k][2:].le(self.timestamp_begin).all() and input_ids.shape[-1] == self.max_length - 1 - ) - # produce timestamp with 30% - if (can_produce and self.let_pass[prev_k][self.count]) or must_produce: - self.no_time_stamp_counter[prev_k] = 0 - self.prev_highest_timestamp[prev_k] = max(input_ids[k].max() + 1, self.timestamp_tokens[0].item()) - - # force a timestamp - scores[k, :] = float(ops.finfo(scores.dtype).min) - scores[k, self.prev_highest_timestamp[prev_k]] = 10.0 - - if ( - input_ids.shape[-1] > 3 - and input_ids[k, -1].item() in self.timestamp_tokens - and input_ids[k, -2].item() not in self.timestamp_tokens - ): - # force the same as before - scores[k, :] = float(ops.finfo(scores.dtype).min) - scores[k, input_ids[k, -1].item()] = 10.0 - - self.count += 1 - - if ops.isinf(scores).all(): - raise ValueError("Dummy logit processor is incorrectly set up. 
Scores should not be all inf.") - - return scores - - -def prepare_whisper_inputs_dict( - config, - input_features, - decoder_input_ids, - attention_mask=None, - decoder_attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, -): - if decoder_attention_mask is None: - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - if decoder_head_mask is None: - decoder_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - if cross_attn_head_mask is None: - cross_attn_head_mask = ops.ones(config.decoder_layers, config.decoder_attention_heads) - return { - # "input_ids": input_features, - "input_features": input_features, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - } - - -@require_mindspore -class WhisperModelTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden_layers - seq_length=60, - is_training=True, - use_labels=False, - vocab_size=200, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - input_channels=1, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=30, - max_target_positions=40, - bos_token_id=98, - eos_token_id=98, - pad_token_id=0, - num_mel_bins=80, - decoder_start_token_id=85, - num_conv_layers=1, - suppress_tokens=None, - begin_suppress_tokens=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_mel_bins = num_mel_bins - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.max_target_positions = max_target_positions - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.num_conv_layers = num_conv_layers - self.suppress_tokens = suppress_tokens - self.begin_suppress_tokens = begin_suppress_tokens - - def prepare_config_and_inputs(self): - input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size) - - decoder_input_ids = mindspore.tensor(self.batch_size * [[self.decoder_start_token_id]]) - - config = self.get_config() - inputs_dict = prepare_whisper_inputs_dict( - config, - attention_mask=None, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - ) - return config, inputs_dict - - def get_config(self): - return WhisperConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - 
max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - max_target_positions=self.max_target_positions, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - decoder_ffn_dim=self.hidden_size, - encoder_ffn_dim=self.hidden_size, - decoder_start_token_id=self.decoder_start_token_id, - suppress_tokens=self.suppress_tokens, - begin_suppress_tokens=self.begin_suppress_tokens, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_subsampled_output_lengths(self, input_lengths): - """ - Computes the output length of the convolutional layers - """ - - for i in range(self.num_conv_layers): - input_lengths = (input_lengths - 1) // 2 + 1 - - return input_lengths - - def create_and_check_model_forward(self, config, inputs_dict, freeze_encoder=False): - model = WhisperModel(config=config).eval() - - if freeze_encoder: - model.freeze_encoder() - - input_features = inputs_dict["input_features"] - decoder_input_ids = inputs_dict["decoder_input_ids"] - - # first forward pass - last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state - - self.parent.assertTrue(last_hidden_state.shape, (13, 7, 16)) - - def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): - model = WhisperModel(config=config).get_decoder().eval() - input_ids = inputs_dict["decoder_input_ids"] - attention_mask = inputs_dict["decoder_attention_mask"] - - # first forward pass - outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) - - output, past_key_values = outputs.to_tuple() - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2) - next_attn_mask = ids_tensor((self.batch_size, 3), 2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([attention_mask, next_attn_mask.to(attention_mask.dtype)], dim=-1) - - output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - - def check_encoder_decoder_model_standalone(self, config, inputs_dict): - model = WhisperModel(config=config).eval() - outputs = model(**inputs_dict) - - encoder_last_hidden_state = outputs.encoder_last_hidden_state - last_hidden_state = outputs.last_hidden_state - - with tempfile.TemporaryDirectory() as tmpdirname: - encoder = model.get_encoder() - encoder.save_pretrained(tmpdirname) - encoder = WhisperEncoder.from_pretrained(tmpdirname) - - encoder_last_hidden_state_2 = encoder(inputs_dict["input_features"])[0] - - self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) - - with tempfile.TemporaryDirectory() as tmpdirname: - decoder = 
model.get_decoder() - decoder.save_pretrained(tmpdirname) - decoder = WhisperDecoder.from_pretrained(tmpdirname) - - last_hidden_state_2 = decoder( - input_ids=inputs_dict["decoder_input_ids"], - attention_mask=inputs_dict["decoder_attention_mask"], - encoder_hidden_states=encoder_last_hidden_state, - )[0] - - self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) - - -@require_mindspore -class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (WhisperModel, WhisperForConditionalGeneration) if is_mindspore_available() else () - all_generative_model_classes = (WhisperForConditionalGeneration,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "audio-classification": WhisperForAudioClassification, - "automatic-speech-recognition": WhisperForConditionalGeneration, - "feature-extraction": WhisperModel, - "text-generation": WhisperForCausalLM, - } - if is_mindspore_available() - else {} - ) - is_encoder_decoder = True - fx_compatible = False - test_pruning = False - test_missing_keys = False - # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222) - # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`) - model_split_percents = [0.5, 0.8, 0.9] - - input_name = "input_features" - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name in [ - "AutomaticSpeechRecognitionPipelineTests", - "AudioClassificationPipelineTests", - ]: - # RuntimeError: The size of tensor a (1500) must match the size of tensor b (30) at non-singleton - # dimension 1 - return True - - return False - - def _get_logits_processor_kwargs(self, do_sample=False, config=None): - # Overwritten from `GenerationTesterMixin`, Whisper needs `"temperature": 0.0` to be able to do beam search - logits_processor_kwargs = super()._get_logits_processor_kwargs(do_sample=do_sample, config=config) - logits_processor_kwargs["temperature"] = 0.0 - return logits_processor_kwargs - - def _get_beam_kwargs(self, num_return_sequences=1): - # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` - beam_kwargs = super()._get_beam_kwargs(num_return_sequences=num_return_sequences) - beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] - return beam_kwargs - - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` - beam_kwargs = super()._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] - return beam_kwargs - - def _get_constrained_beam_kwargs(self, num_return_sequences=1): - # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` - beam_kwargs = super()._get_constrained_beam_kwargs(num_return_sequences=num_return_sequences) - beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] - return beam_kwargs - - def setUp(self): - self.model_tester = WhisperModelTester(self) - self.config_tester = ConfigTester(self, config_class=WhisperConfig) - self.maxDiff = 3000 - - def test_config(self): - self.config_tester.run_common_tests() - - def test_save_load_strict(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - 
for model_class in self.all_model_classes: - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) - self.assertEqual(info["missing_keys"], []) - - def test_model_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - def test_model_forward_with_frozen_encoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs, freeze_encoder=True) - - def test_requires_grad_with_frozen_encoder(self): - config = self.model_tester.get_config() - for model_class in self.all_model_classes: - model = model_class(config) - model.freeze_encoder() - - try: - encoder_grads = [param.requires_grad for param in model.encoder.parameters()] - decoder_grads = [param.requires_grad for param in model.decoder.parameters()] - except AttributeError: - encoder_grads = [param.requires_grad for param in model.model.encoder.parameters()] - decoder_grads = [param.requires_grad for param in model.model.decoder.parameters()] - - self.assertFalse(all(encoder_grads)) - self.assertTrue(all(decoder_grads)) - - def test_requires_grad_encoder_embed_positions(self): - config = self.model_tester.get_config() - for model_class in self.all_model_classes: - model = model_class(config) - encoder = model.get_encoder() - self.assertFalse(encoder.embed_positions.weight.requires_grad) - - def test_encoder_sinusoidal_embed_positions(self): - config = self.model_tester.get_config() - for model_class in self.all_model_classes: - model = model_class(config) - embeds = model.get_encoder().embed_positions.weight - self.assertTrue(ops.allclose(embeds, sinusoids(*embeds.shape))) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_encoder_decoder_model_standalone(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - - # def _get_input_ids_and_config(self, batch_size=3): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # input_ids = inputs_dict[self.input_name] - - # # cut to half length & take max batch_size=batch_size - # input_ids = input_ids[:batch_size, :, :] - - # if config.eos_token_id is not None and config.pad_token_id is None: - # # hack to allow generate for models such as GPT2 as is done in `generate()` - # config.pad_token_id = config.eos_token_id - - # return config, input_ids, None - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - decoder_input_ids = inputs.pop("decoder_input_ids", None) - inputs.pop("decoder_attention_mask", None) - - wte = model.get_input_embeddings() - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - # training is not supported yet - @unittest.skip(reason="Training is not supported yet") - def test_training(self): - pass - - @unittest.skip(reason="Training is not supported yet") - 
def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip - def test_generate_with_head_masking(self): - pass - - @require_mindspore - def test_generate_fp16(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - config.max_target_positions = 400 - input_features = input_dict["input_features"] - model = WhisperForConditionalGeneration(config).eval() - input_features = input_features.half() - model.half() - model.generate(input_features) - model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - - def test_generate_language(self): - config, input_dict = self.model_tester.prepare_config_and_inputs() - input_features = input_dict["input_features"] - model = WhisperForConditionalGeneration(config) - # Hack to keep the test fast and not require downloading a model with a generation_config - model.generation_config.__setattr__("lang_to_id", {"<|en|>": 1}) - model.generation_config.__setattr__("task_to_id", {"transcribe": 2}) - - # test language code - model.generate(input_features, language="en") - # test language token - model.generate(input_features, language="<|en|>") - # test language name - model.generate(input_features, language="English") - # test language code list - model.generate(input_features, language=["en"] * input_features.shape[0]) - # test language token list - model.generate(input_features, language=["<|en|>"] * input_features.shape[0]) - # test language name list - model.generate(input_features, language=["English"] * input_features.shape[0]) - # test list of the wrong length - with self.assertRaises(ValueError): - model.generate(input_features, language=["en"] * (input_features.shape[0] + 1)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = [ - "input_features", - "attention_mask", - "decoder_input_ids", - "decoder_attention_mask", - ] - expected_arg_names.extend( - ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"] - if "head_mask" in arg_names and "decoder_head_mask" in arg_names and "cross_attn_head_mask" in arg_names - else ["encoder_outputs"] - ) - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if
hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - else: - seq_length = self.model_tester.seq_length - - subsampled_seq_length = model._get_feat_extract_output_lengths(seq_length) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [subsampled_seq_length, self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 1) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 1) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", 1) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - - subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length) - subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length) - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - out_len = len(outputs) - - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - 
list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - subsampled_encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 2 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length], - ) - - def test_resize_tokens_embeddings(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is False") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.vocab_size - # Retrieve the embeddings and clone them - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # make sure that decoder_input_ids are resized - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix.
- models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_resize_embeddings_untied(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is False") - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - self.skipTest(reason="Model cannot untie embeddings") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - self.assertEqual(model.config.vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - self.assertEqual(model.config.vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - @unittest.skip - def test_generate_without_input_ids(self): - pass - - @staticmethod - def _get_encoder_outputs( - model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1 - ): - encoder = model.get_encoder() - encoder_outputs = encoder( - input_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - encoder_outputs["last_hidden_state"] = ops.repeat_interleave( - encoder_outputs.last_hidden_state, num_interleave, dim=0 - ) - generation_config = copy.deepcopy(model.generation_config) - model._prepare_special_tokens(generation_config) - input_ids = input_ids[:, :, 0] - input_ids = ops.zeros_like(input_ids[:, :1], dtype=mindspore.int64) + generation_config.decoder_start_token_id - attention_mask = None - return encoder_outputs, input_ids, attention_mask - - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): - batch_size, mel, seq_length = input_ids.shape - subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) - 
num_sequences_in_output = batch_size * num_return_sequences - gen_len = ( - output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length - ) - - # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) - - # Attentions - # encoder - self._check_encoder_attention_for_generate( - output.encoder_attentions, batch_size, config, subsampled_seq_length - ) - # decoder - self._check_attentions_for_generate( - num_sequences_in_output, - output.decoder_attentions, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - # Hidden States - # encoder - self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, subsampled_seq_length - ) - - # decoder - self._check_hidden_states_for_generate( - num_sequences_in_output, - output.decoder_hidden_states, - min_length=1, - max_length=output.sequences.shape[-1], - config=config, - use_cache=use_cache, - ) - - def test_mask_feature_prob(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.mask_feature_prob = 0.2 - config.mask_feature_length = 2 - - for model_class in self.all_model_classes: - model = model_class(config) - model.train() - - # forward pass - encoder_last_hidden_state = model(**input_dict).encoder_last_hidden_state - self.assertTrue(encoder_last_hidden_state.shape, (13, 30, 16)) - - def test_mask_time_prob(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.mask_time_prob = 0.2 - config.mask_time_length = 2 - - for model_class in self.all_model_classes: - model = model_class(config) - model.train() - - # forward pass - encoder_last_hidden_state = model(**input_dict).encoder_last_hidden_state - self.assertTrue(encoder_last_hidden_state.shape, (13, 30, 16)) - - def test_generate_with_prompt_ids_and_task_and_language(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = WhisperForConditionalGeneration(config).eval() - input_features = input_dict["input_features"] - prompt_ids = ops.arange(5) - language = "<|de|>" - task = "translate" - lang_id = 6 - task_id = 7 - model.generation_config.__setattr__("lang_to_id", {language: lang_id}) - model.generation_config.__setattr__("task_to_id", {task: task_id}) - - output = model.generate(input_features, max_new_tokens=5, task=task, language=language, prompt_ids=prompt_ids) - - expected_output_start = [ - *prompt_ids.tolist(), - model.generation_config.decoder_start_token_id, - lang_id, - task_id, - ] - for row in output.tolist(): - self.assertListEqual(row[: len(expected_output_start)], expected_output_start) - - def test_generate_with_prompt_ids_and_forced_decoder_ids(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = WhisperForConditionalGeneration(config).eval() - input_features = input_dict["input_features"] - prompt_ids = ops.arange(5) - forced_decoder_ids = [(1, 6), (2, 7), (3, 8)] - - output = model.generate( - input_features, max_new_tokens=5, forced_decoder_ids=forced_decoder_ids, prompt_ids=prompt_ids - ) - - expected_output_start = [ - *prompt_ids.tolist(), - model.generation_config.decoder_start_token_id, - *[token for _rank, token in forced_decoder_ids], - ] - for row in output.tolist(): - self.assertListEqual(row[: len(expected_output_start)], expected_output_start) - - def test_generate_with_prompt_ids_max_length(self): - config, input_dict = 
self.model_tester.prepare_config_and_inputs_for_common() - config.max_target_positions = 7 - - model = WhisperForConditionalGeneration(config).eval() - input_features = input_dict["input_features"] - decoder_input_ids = ops.arange(5) - prompt_ids = decoder_input_ids[:4] - max_new_tokens = 8 - - with self.assertRaisesRegex( - ValueError, - f"The length of `decoder_input_ids`, including special start tokens, prompt tokens, and previous tokens, is {decoder_input_ids.shape[-1]}, " - f" and `max_new_tokens` is {max_new_tokens}. Thus, the combined length of " - f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the " - f"`max_target_positions` of the Whisper model: {config.max_target_positions}. " - "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " - f"so that their combined length is less than {config.max_target_positions}.", - ): - model.generate(input_features, max_new_tokens=max_new_tokens, prompt_ids=prompt_ids) - - model.generate(input_features, max_new_tokens=1, prompt_ids=prompt_ids) - - def test_generate_longform_with_prompt_ids(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = WhisperForConditionalGeneration(config).eval() - - prompt_ids = ops.arange(5) - model.generation_config.no_timestamps_token_id = 11 - model.generation_config.pad_token_id = 10 - - # make sure prompt token ids [0-9] can't be generated - model.generation_config.suppress_tokens = list(range(10)) - - input_features = input_dict["input_features"] - - language = "<|de|>" - lang_id = 6 - - input_features = input_features.tile((1, 1, 50)) - attention_mask = ops.ones_like(input_features, dtype=mindspore.int64)[:, 0] - - for prompt_type in ["first-segment", "all-segments"]: - for task_id, task in enumerate(["translate", "transcribe"]): - task_id = 7 + task_id - - model.generation_config.__setattr__("lang_to_id", {language: lang_id}) - model.generation_config.__setattr__("task_to_id", {task: task_id}) - - output = model.generate( - input_features, - attention_mask=attention_mask, - prompt_condition_type=prompt_type, - max_new_tokens=5, - task=task, - language=language, - prompt_ids=prompt_ids, - condition_on_prev_tokens=True, - ) - for row in output.tolist(): - # make sure no token below 10 is in generated output => this means for long-form prompt ids should NOT be returned - assert not any(i in row for i in model.generation_config.suppress_tokens) - - def _check_longform_generate_single_batch(self, condition_on_prev_tokens): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - - model = WhisperForConditionalGeneration(config).eval() - input_features = input_dict["input_features"] - - # len = 250 with num_input_frames = 60 - long_input_features = ops.cat([input_features.tile((1, 1, 4)), input_features[:, :, :10]], dim=-1) - - # force bsz=1 - long_input_features = long_input_features[:1] - vocab_size = model.config.vocab_size - - batch_size = 1 - num_timestamp_tokens = 20 - max_length = 16 - logits_processor = [ - DummyTimestampLogitProcessor( - vocab_size - num_timestamp_tokens, - vocab_size, - batch_size=batch_size, - max_length=max_length, - min_space=4, - ) - ] - - # each chunk should not be longer than 10 - model.generation_config.max_length = max_length - - # if input features are long can't set return_timestamps to False - with self.assertRaises(ValueError): - _ = model.generate(long_input_features, logits_processor=logits_processor, 
return_timestamps=False) - - # if input features are long need to set generation config - with self.assertRaises(ValueError): - _ = model.generate(long_input_features, logits_processor=logits_processor) - - timestamp_begin = vocab_size - num_timestamp_tokens - model.generation_config.no_timestamps_token_id = timestamp_begin - 1 - model.generation_config.eos_token_id = None - model.config.eos_token_id = None - model.generation_config._detect_timestamp_from_logprob = False - # make sure that we only have the same begin token - model.generation_config.max_initial_timestamp_index = 0 - model.generation_config.prev_bos_token_id = timestamp_begin - 3 - - gen_kwargs = { - "logits_processor": logits_processor, - "return_segments": True, - "condition_on_prev_tokens": condition_on_prev_tokens, - } - - if condition_on_prev_tokens: - gen_kwargs["no_speech_threshold"] = 0.6 - gen_kwargs["temperature"] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0) - gen_kwargs["compression_ratio_threshold"] = 2.4 - gen_kwargs["logprob_threshold"] = -1.0 - - outputs = model.generate(long_input_features, **gen_kwargs) - - segments = outputs["segments"][0] - - for _, segment in enumerate(segments): - assert segment["start"] <= segment["end"], "start has to be smaller equal end" - assert any( - s > timestamp_begin for s in segment["tokens"][1:] - ), f"At least one segment token should be a timestamp token, but not first., {segment['tokens']}" - assert ( - segment["tokens"].shape[-1] <= max_length - ), "make sure that no segment is larger than max generation length" - - def test_longform_generate_single_batch(self): - self._check_longform_generate_single_batch(condition_on_prev_tokens=False) - - def test_longform_generate_single_batch_cond_prev(self): - self._check_longform_generate_single_batch(condition_on_prev_tokens=True) - - def _check_longform_generate_multi_batch(self, condition_on_prev_tokens): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - - model = WhisperForConditionalGeneration(config).eval() - input_features = input_dict["input_features"] - input_features = input_features[:2] - - # len = 250 with num_input_frames = 60 - long_input_features = ops.cat([input_features.tile((1, 1, 4)), input_features[:, :, :10]], dim=-1) - input_features_2 = long_input_features[1:] - attention_mask = ops.ones( - (2, long_input_features.shape[-1]), dtype=input_features.dtype - ) - attention_mask[0, 200:] = 0 - - # force bsz=1 - vocab_size = model.config.vocab_size - - batch_size = 1 - num_timestamp_tokens = 20 - max_new_tokens = 16 - timestamp_begin = vocab_size - num_timestamp_tokens - model.generation_config.no_timestamps_token_id = timestamp_begin - 1 - model.generation_config.eos_token_id = None - model.config.eos_token_id = None - model.generation_config._detect_timestamp_from_logprob = False - # make sure that we only have the same begin token - model.generation_config.max_initial_timestamp_index = 0 - model.generation_config.max_new_tokens = max_new_tokens - model.generation_config.prev_bos_token_id = timestamp_begin - 3 - - logits_processor = [ - DummyTimestampLogitProcessor( - vocab_size - num_timestamp_tokens, - vocab_size, - batch_size=batch_size, - max_length=max_new_tokens, - min_space=4, - seed=1, - ) - ] - outputs_2 = model.generate( - input_features_2, - max_new_tokens=max_new_tokens, - logits_processor=logits_processor, - condition_on_prev_tokens=condition_on_prev_tokens, - return_segments=True, - ) - tokens_2 = outputs_2["sequences"][0] - segments_2 = outputs_2["segments"][0] - - batch_size = 2 
- logits_processor = [ - DummyTimestampLogitProcessor( - vocab_size - num_timestamp_tokens, - vocab_size, - batch_size=batch_size, - max_length=max_new_tokens, - min_space=4, - seed=0, - ) - ] - gen_kwargs = { - "logits_processor": logits_processor, - "return_segments": True, - "condition_on_prev_tokens": condition_on_prev_tokens, - "attention_mask": attention_mask, - "max_new_tokens": max_new_tokens, - } - - outputs = model.generate(long_input_features, **gen_kwargs) - tokens = outputs["sequences"][1] - segments = outputs["segments"][1] - - # make sure batched and non-batched is the same - assert tokens_2.tolist() == tokens[: tokens_2.shape[-1]].tolist() - - for seg1, seg2 in zip(segments_2, segments): - assert seg1["start"] == seg2["start"] - assert seg1["end"] == seg2["end"] - assert seg1["tokens"].tolist() == seg2["tokens"].tolist() - - def test_longform_generate_multi_batch(self): - self._check_longform_generate_multi_batch(condition_on_prev_tokens=False) - - def test_longform_generate_multi_batch_cond_prev(self): - self._check_longform_generate_multi_batch(condition_on_prev_tokens=True) - - @is_flaky() # TODO (joao, sanchit): fails ~9% of the times. Does the original test have the same issue? - def test_custom_4d_attention_mask(self): - config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = WhisperForConditionalGeneration(config).to(dtype=mindspore.float32) - model.eval() - - ( - input_ids, - position_ids, - input_ids_shared_prefix, - mask_shared_prefix, - position_ids_shared_prefix, - ) = self._get_custom_4d_mask_test_data() - - with no_grad(): - logits = model.forward( - decoder_input_ids=input_ids, - input_features=input_dict["input_features"], - decoder_position_ids=position_ids, - ).logits - # logits.shape == ([3, 4, ...]) - - logits_shared_prefix = model( - decoder_input_ids=input_ids_shared_prefix, - input_features=input_dict["input_features"], - decoder_attention_mask=mask_shared_prefix, - decoder_position_ids=position_ids_shared_prefix, - )[0] - # logits_shared_prefix.shape == ([1, 6, ...]) - - out_last_tokens = logits[:, -1, :] # last tokens in each batch line - out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens - - # comparing softmax-normalized logits: - normalized_0 = nn.functional.softmax(out_last_tokens) - normalized_1 = nn.functional.softmax(out_shared_prefix_last_tokens) - assert ops.allclose(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - - @parameterized.expand([(True,), (False,)]) - def test_generate_output_type(self, return_dict_in_generate): - expected_output_type = GenerateEncoderDecoderOutput if return_dict_in_generate else mindspore.Tensor - for model_class in self.all_generative_model_classes: - config, inputs = self.model_tester.prepare_config_and_inputs() - model = model_class(config).eval() - - # short-form generation without fallback - pred_ids = model.generate(**inputs, return_dict_in_generate=return_dict_in_generate) - assert isinstance(pred_ids, expected_output_type) - - # short-form generation with fallback - pred_ids = model.generate( - **inputs, - logprob_threshold=-1.0, - temperature=[0.0, 0.1], - return_dict_in_generate=return_dict_in_generate, - ) - assert isinstance(pred_ids, expected_output_type) - - -@require_mindspore -class WhisperModelIntegrationTests(unittest.TestCase): - def setUp(self): - self._unpatched_generation_mixin_generate = mindnlp.transformers.GenerationMixin.generate - - def tearDown(self): - mindnlp.transformers.GenerationMixin.generate = 
self._unpatched_generation_mixin_generate - - @cached_property - def default_processor(self): - return WhisperProcessor.from_pretrained("openai/whisper-base") - - def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return [x["array"] for x in speech_samples] - - def _patch_generation_mixin_generate(self, check_args_fn=None): - test = self - - def generate(self, *args, **kwargs): - if check_args_fn is not None: - check_args_fn(*args, **kwargs) - return test._unpatched_generation_mixin_generate(self, *args, **kwargs) - - mindnlp.transformers.GenerationMixin.generate = generate - - @slow - def test_tiny_logits_librispeech(self): - set_seed(2345) - model = WhisperModel.from_pretrained("openai/whisper-tiny") - input_speech = self._load_datasamples(1) - feature_extractor = WhisperFeatureExtractor() - input_features = feature_extractor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - with no_grad(): - logits = model( - input_features, - decoder_input_ids=mindspore.tensor([[50258, 50259, 50359]]), - output_hidden_states=False, - output_attentions=False, - return_dict=False, - use_cache=False, - ) - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - 2.9892, -6.7607, 5.7348, 3.6096, 0.2152, -5.7321, 4.8855, -1.6407, - 0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246, - 4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3713, - 0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841 - ] - ) - # fmt: on - self.assertTrue(ops.allclose(logits[0][0, 0, :30], EXPECTED_LOGITS, atol=1e-4)) - - # fmt: off - EXPECTED_GENERATION = mindspore.tensor( - [ - -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3203, 1.9836, - 0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691, - 1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958, - 1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609 - ] - ) - # fmt: on - - head_logits = logits[0] @ model.decoder.embed_tokens.weight.T - self.assertTrue(ops.allclose(head_logits[0, 0, :30], EXPECTED_GENERATION, atol=1e-4)) - - @slow - def test_small_en_logits_librispeech(self): - set_seed(2345) - model = WhisperModel.from_pretrained("openai/whisper-small.en") - - input_speech = self._load_datasamples(1) - - feaure_extractor = WhisperFeatureExtractor() - input_features = feaure_extractor(input_speech, return_tensors="ms").input_features - - logits = model( - input_features, - decoder_input_ids=mindspore.tensor([[model.config.decoder_start_token_id]]), - output_hidden_states=False, - output_attentions=False, - use_cache=False, - ) - - logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - -3.6784, -7.7211, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188, - -8.0104, -4.6238, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935, - -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781, - -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509, - -11.1146, -8.1918 - ] - ) - # fmt: on - self.assertTrue(ops.allclose(logits[0, 0, :30], EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_large_logits_librispeech(self): - set_seed(2345) - - model = WhisperModel.from_pretrained("openai/whisper-large") - - input_speech = self._load_datasamples(1) - - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - 
processed_inputs = processor( - audio=input_speech, - text="This part of the speech", - add_special_tokens=False, - return_tensors="ms", - sampling_rate=16_000, - ) - input_features = processed_inputs.input_features - decoder_input_ids = processed_inputs.labels - - logits = model( - input_features, - decoder_input_ids=decoder_input_ids, - output_hidden_states=False, - output_attentions=False, - use_cache=False, - ) - - logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - 2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8576, -0.6521, 2.5472, - 1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357, - 1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376, - 1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184 - ] - ) - # fmt: on - - self.assertTrue(ops.allclose(logits[0, 0, :30], EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_tiny_en_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - model.config.decoder_start_token_id = 50257 - - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generated_ids = model.generate(input_features, num_beams=5, max_length=20) - transcript = processor.tokenizer.batch_decode(generated_ids)[0] - - EXPECTED_TRANSCRIPT = ( - "<|startoftranscript|><|notimestamps|> Mr. Quilter is the apostle of the middle" - " classes, and we are glad to" - ) - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_tiny_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generated_ids = model.generate(input_features, num_beams=5, max_length=20) - transcript = processor.tokenizer.decode(generated_ids[0]) - - EXPECTED_TRANSCRIPT = ( - "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle" - " classes and we are glad" - ) - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_large_generation(self): - set_seed(123) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") - - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generated_ids = model.generate( - input_features, do_sample=False, max_length=20, language="<|en|>", task="transcribe" - ) - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - - EXPECTED_TRANSCRIPT = " Mr. 
Quilter is the apostle of the middle classes and we are glad" - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_large_generation_multilingual(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") - - ds = load_dataset( - "facebook/multilingual_librispeech", "german", split="test", streaming=True, trust_remote_code=True - ) - ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) - - input_speech = next(iter(ds))["audio"]["array"] - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generated_ids = model.generate( - input_features, do_sample=False, max_length=20, language="<|de|>", task="transcribe" - ) - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " Mein sechster Sohn scheint, wenigstens auf den ersten Blick," - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - generated_ids = model.generate( - input_features, do_sample=False, max_length=20, language="<|de|>", task="translate" - ) - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - EXPECTED_TRANSCRIPT = " My sixth son seems, at least at first glance, the most deeply-minded" - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_large_batched_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") - - input_speech = self._load_datasamples(4) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - generated_ids = model.generate(input_features, max_length=20, task="translate") - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - [50258, 50259, 50358, 50363, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404], - [50258, 50259, 50358, 50363, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257], - [50258, 50259, 50358, 50363, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904], - [50258, 50259, 50358, 50363, 634, 575, 12525, 22618, 1968, 6144, 35617, 20084, 1756, 311, 589, 307, 534, 10281, 934, 439] - ] - ) - # fmt: on - - self.assertTrue(ops.allclose(generated_ids, EXPECTED_LOGITS)) - - # fmt: off - EXPECTED_TRANSCRIPT = [ - " Mr. Quilter is the apostle of the middle classes and we are glad", - " Nor is Mr. 
Quilter's manner less interesting than his matter.", - " He tells us that at this festive season of the year, with Christmas and roast", - " He has grave doubts whether Sir Frederick Layton's work is really Greek after all", - ] - # fmt: on - - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True) - self.assertListEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_large_batched_generation_multilingual(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-large") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") - - token = os.getenv("HF_HUB_READ_TOKEN", True) - ds = load_dataset( - "mozilla-foundation/common_voice_6_1", - "ja", - split="test", - streaming=True, - token=token, - trust_remote_code=True, - ) - ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000)) - - input_speech = next(iter(ds))["audio"]["array"] - input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="ms").input_features - - EXPECTED_TRANSCRIPTS = ["木村さんに電話を貸してもらいました", " Kimura-san called me."] - - generated_ids = model.generate( - input_features.tile((2, 1, 1)), - do_sample=False, - max_length=20, - language=["<|ja|>", "<|en|>"], - task="transcribe", - ) - transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True) - self.assertEqual(transcripts, EXPECTED_TRANSCRIPTS) - - @slow - def test_tiny_en_batched_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - input_speech = self._load_datasamples(4) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - generated_ids = model.generate(input_features, max_length=20) - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284], - [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256], - [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236], - [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460] - ] - - ) - # fmt: on - print(generated_ids) - self.assertTrue(ops.allclose(generated_ids, EXPECTED_LOGITS)) - - # fmt: off - EXPECTED_TRANSCRIPT = [ - " Mr. Quilter is the apostle of the middle classes, and we are glad to", - " Nor is Mr. 
Quilter's manner less interesting than his matter.", - " He tells us that at this festive season of the year, with Christmas and roast beef looming", - " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can", - ] - # fmt: on - - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True) - self.assertListEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_tiny_timestamp_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - input_speech = np.concatenate(self._load_datasamples(4)) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generated_ids = model.generate(input_features, max_length=448, return_timestamps=True) - - EXPECTED_OUTPUT = mindspore.tensor([50258, 50259, 50359, 50364, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50692, 50692, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50926, 50926, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 51208, 51208, 949, 505, 11, 14138, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51552, 51552, 634, 575, 12525, 22618, 1968, 6144, 35617, 7354, 1292, 6, 589, 307, 534, 10281, 934, 439, 11, 293, 51836, 51836, 50257]) # fmt: skip - - self.assertTrue(ops.allclose(generated_ids, EXPECTED_OUTPUT)) - - EXPECTED_TRANSCRIPT = [ - { - "text": ( - " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is" - " Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season" - " of the year, with Christmas and roast beef looming before us, similarly drawn from eating and" - " its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins'" - " work is really Greek after all, and" - ), - "offsets": [ - { - "text": ( - " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." - ), - "timestamp": (0.0, 6.5600000000000005), - }, - { - "text": " Nor is Mr. Quilter's manner less interesting than his matter.", - "timestamp": (6.5600000000000005, 11.24), - }, - { - "text": ( - " He tells us that at this festive season of the year, with Christmas and roast beef" - " looming" - ), - "timestamp": (11.24, 16.88), - }, - { - "text": ( - " before us, similarly drawn from eating and its results occur most readily to the mind." 
- ), - "timestamp": (16.88, 23.76), - }, - { - "text": ( - " He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and" - ), - "timestamp": (23.76, 29.44), - }, - ], - } - ] - - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_large_timestamp_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") - - input_speech = np.concatenate(self._load_datasamples(4)) - input_features = processor( - input_speech, return_tensors="ms", sampling_rate=16_000, return_token_timestamps=True - ).input_features - - generated_ids = model.generate(input_features, max_length=448, return_timestamps=True) - - # fmt: off - EXPECTED_OUTPUT = mindspore.tensor([50258, 50259, 50360, 50365, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50629, 50682, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50870, 50911, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 949, 505, 11, 51245, 51287, 1034, 4680, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51494, 51523, 634, 575, 12525, 22618, 1968, 6144, 35617, 1456, 397, 266, 311, 589, 307, 534, 10281, 934, 439, 11, 51799, 51815, 50257]) - # fmt: on - print(generated_ids) - self.assertTrue(ops.allclose(generated_ids, EXPECTED_OUTPUT)) - - EXPECTED_TRANSCRIPT = [ - { - "text": ( - " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." - " Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive" - " season of the year, with Christmas and roast beef looming before us, similes drawn from eating" - " and its results occur most readily to the mind. He has grave doubts whether Sir Frederick " - "Leighton's work is really Greek after all," - ), - "offsets": [ - { - "text": ( - " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." - ), - "timestamp": (0.0, 5.28), - }, - { - "text": " Nor is Mr. 
Quilter's manner less interesting than his matter.", - "timestamp": (6.34, 10.1), - }, - { - "text": ( - " He tells us that at this festive season of the year, with Christmas and roast beef looming before us," - ), - "timestamp": (10.92, 17.6), - }, - { - "text": (" similes drawn from eating and its results occur most readily to the mind."), - "timestamp": (18.44, 22.580000000000002), - }, - { - "text": ( - " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all," - ), - "timestamp": (23.16, 28.68), - }, - ], - } - ] - - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - - @slow - def test_tiny_token_timestamp_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] - - input_speech = self._load_datasamples(4) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generate_outputs = model.generate( - input_features, max_length=448, return_timestamps=True, return_token_timestamps=True - ) - - self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape) - - # fmt: off - EXPECTED_OUTPUT = mindspore.tensor([ - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4800, 0.8200, 0.9600, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0000, 2.3400, 2.5000, 2.6600, 3.1800, 3.5600, 3.6800, 3.8000, 4.1000, 4.3000, 4.5800, 4.9400, 5.3800, 12.4200, 12.8400, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9200, 26.9400, 26.9400, 26.9400, 26.9400, 29.8400 ], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.5200, 0.9000, 1.1400, 1.4200, 1.5200, 1.6800, 1.6800, 1.8800, 2.1000, 2.2200, 2.6200, 3.1400, 3.5800, 3.9600, 4.4000, 17.3000, 17.3000, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7200, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 26.7400, 28.0000 ], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7600, 1.0000, 1.4200, 1.8000, 1.9400, 2.1800, 2.5200, 3.0200, 3.3200, 3.5400, 3.9400, 4.5600, 4.9200, 5.2800, 5.5600, 5.9000, 6.1600, 6.3000, 6.4800, 6.4800, 6.6400, 7.8200, 7.9600, 8.2200, 8.6000, 8.9200, 9.2200, 9.5200, 9.7200, 10.0600, 10.5400, 10.8800, 11.2600, 11.5400, 11.7400, 12.0800, 15.6800, 15.6800], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.7400, 1.0400, 1.3200, 1.6800, 2.1400, 2.4800, 2.7800, 3.0800, 3.1600, 3.4000, 3.6000, 4.0200, 4.2200, 4.8600, 5.2400, 5.7400, 6.3400, 6.6200, 6.7600, 6.7600, 6.8600, 7.2400, 7.4200, 7.6800, 7.9200, 8.4800, 8.7600, 9.2000, 9.2000, 9.4200, 15.8200, 15.8200, 29.6400, 29.6600, 29.6600, 29.6600, 29.6600, 29.7600] - ]) - # fmt: on - - self.assertTrue(ops.allclose(generate_outputs.token_timestamps, EXPECTED_OUTPUT)) - - @slow - def test_large_token_timestamp_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") - - input_speech = self._load_datasamples(4) - input_features = processor( - input_speech, return_tensors="ms", sampling_rate=16_000, return_token_timestamps=True - ) - - generate_outputs = model.generate( - **input_features, max_length=448, return_timestamps=True, return_token_timestamps=True - ) - - 
self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape) - - # fmt: off - EXPECTED_OUTPUT = mindspore.tensor([ - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6200, 0.7400, 0.8600, 1.0000, 1.0400, 1.3000, 1.4400, 1.7800, 2.1800, 2.2800, 2.5000, 2.9200, 3.0000, 3.3800, 3.5000, 3.6000, 3.8400, 4.1000, 4.4000, 4.6800, 5.1400, 5.3600, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6000, 0.9200, 1.2200, 1.3400, 1.4200, 1.5400, 1.5800, 1.7400, 2.0600, 2.3800, 3.0400, 3.3800, 3.6400, 4.1200, 4.3600, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5400, 0.8200, 1.1600, 1.4600, 1.7400, 1.8800, 2.3400, 2.7400, 3.1400, 3.2200, 3.5400, 4.2800, 4.5600, 4.8200, 5.0600, 5.3200, 5.6600, 5.9600, 6.1400, 6.4000, 6.8400, 7.8800, 8.0200, 8.3600, 8.7000, 9.0200, 9.3200, 9.5000, 9.8400, 10.3000, 10.6600, 11.0800, 11.3600, 11.4600, 11.8000, 12.4600], - [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5600, 0.7600, 1.0600, 1.4000, 1.8800, 2.2600, 2.6200, 2.8000, 2.9600, 3.0000, 3.2000, 3.4400, 3.6800, 4.0000, 4.6000, 5.0000, 5.3200, 5.4800, 6.0600, 6.0600, 6.1000, 6.3200, 6.7400, 7.0000, 7.2200, 7.4000, 7.7600, 8.0600, 8.5600, 8.8600, 8.9400, 9.1000, 9.3400, 9.8800, 9.8800, 9.8800] - ]) - # fmt: on - - self.assertTrue(ops.allclose(generate_outputs.token_timestamps, EXPECTED_OUTPUT)) - - @slow - def test_tiny_token_timestamp_batch_generation(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] - - num_samples = 4 - num_return_sequences = 2 - - input_speech = self._load_datasamples(num_samples) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - generate_outputs = model.generate( - input_features, - max_length=448, - return_timestamps=True, - return_token_timestamps=True, - num_beams=3, - num_return_sequences=num_return_sequences, - ) - - # task id and lang id prompts should not have timestamp tokens - self.assertEqual(generate_outputs.sequences.shape[-1] - 2, generate_outputs.token_timestamps.shape[-1]) - - self.assertEqual(len(generate_outputs.sequences), num_return_sequences * num_samples) - - @slow - def test_tiny_token_timestamp_generation_longform(self): - set_seed(2345) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] - - input_speech = self._load_datasamples(5) - long_input_speech = np.concatenate(input_speech, dtype=np.float32) - inputs = processor( - long_input_speech, - return_tensors="ms", - truncation=False, # False so the audio isn't truncated and whole audio is sent to the model - return_attention_mask=True, - padding=True, - ) - - generate_outputs = model.generate(**inputs, return_segments=True, return_token_timestamps=True) - - token_timestamps_shape = [ - [segment["token_timestamps"].shape for segment in segment_list] - for segment_list in generate_outputs["segments"] - ] - tokens_shape = [ - [segment["tokens"].shape for segment in 
segment_list] for segment_list in generate_outputs["segments"] - ] - self.assertListEqual(tokens_shape, token_timestamps_shape) - - # fmt: off - EXPECTED_OUTPUT = [ - mindspore.tensor([0.0000, 0.4200, 0.8200, 0.9400, 1.1200, 1.1200, 1.2200, 1.5000, 1.7200, 2.0400, 2.3400, 2.5200, 2.6600, 3.2000, 3.4400, 3.5600, 3.6800, 3.8200, 4.1000, 4.3000, 4.5800, 4.9400, 5.4000, 6.3600]), - mindspore.tensor([ 6.5400, 6.5400, 6.7400, 6.9600, 7.2600, 7.3400, 7.5800, 7.5800, 7.6400, 7.8400, 8.1000, 8.5000, 9.0000, 9.4800, 9.7200, 10.2600, 11.1000]), - mindspore.tensor([11.2200, 11.2200, 11.4200, 11.6600, 12.0800, 12.4400, 12.5800, 12.8400, 13.1800, 13.6800, 14.0000, 14.2200, 14.6200, 14.9800, 15.2200, 15.6000, 15.9400, 16.2000, 16.5600, 16.8400, 16.9800]), - mindspore.tensor([16.9800, 16.9800, 17.3200, 18.1600, 18.6400, 18.8600, 19.2800, 19.5600, 19.8800, 20.1800, 20.3800, 20.7200, 21.1600, 21.5400, 21.9000, 22.2000, 22.4200, 22.8600, 23.7000]), - mindspore.tensor([23.7000, 23.7000, 23.9400, 24.1800, 24.3800, 24.8400, 25.2800, 25.6600, 25.9200, 26.2600, 26.4000, 26.5800, 26.7600, 27.1400, 27.3800, 28.0400, 28.3800, 28.8200, 29.3400, 29.5200]), - mindspore.tensor([29.4400, 29.4400, 29.7000, 30.0800, 30.3800, 30.5400, 30.8200, 31.0600, 31.6600, 31.9200, 32.3000, 32.4800, 32.6200, 33.6800]), - mindspore.tensor([33.8000, 33.8000, 33.9800, 33.9800, 34.1800, 34.4400, 34.6200, 35.0000, 35.2200, 35.3200, 35.5600, 35.9200, 36.3800, 36.6200, 36.6600, 36.9600, 37.3400, 37.9800, 38.5800, 38.7200, 38.9800, 39.4400, 39.5800, 39.8000, 40.1200, 40.2600]), - mindspore.tensor([40.5200, 40.5200, 40.6200, 41.1000, 41.5400, 41.9200, 42.1000, 42.3200, 42.3200, 43.0600, 44.6000]), - mindspore.tensor([44.7000, 44.7000, 44.8600, 44.9400, 45.1400, 45.1400, 45.2800, 45.6200, 45.9000, 46.2600, 47.1600, 47.4800, 47.7400, 48.1000, 48.2800, 48.4000, 48.6200, 48.8400, 49.0400, 49.2800, 49.4800, 49.6600, 49.9400, 50.5400]), - mindspore.tensor([50.5400, 50.5400, 50.6600, 50.8800, 51.2400, 51.7200, 52.8400]), - mindspore.tensor([52.9600, 52.9600, 53.0400, 53.2600, 53.4200, 53.5800, 53.9200, 54.1200, 54.7200, 54.9400, 55.2600, 55.6200, 55.9800, 56.5600, 56.8000, 56.9200, 57.3600, 57.9200, 58.1800, 58.5000, 58.6400, 58.8200]), - mindspore.tensor([58.6800, 58.6800, 59.1400, 59.5400, 59.9200, 60.1600, 60.3800, 60.8200, 61.6200, 62.2600, 75.2000]), - ] - # fmt: on - - for segment, exp_segment in zip(generate_outputs["segments"][0], EXPECTED_OUTPUT): - self.assertTrue(ops.allclose(segment["token_timestamps"], exp_segment)) - - @slow - def test_tiny_specaugment_librispeech(self): - set_seed(2345) - # Apply SpecAugment - model = WhisperModel.from_pretrained("openai/whisper-tiny", apply_spec_augment=True) - # Set model to training mode to enable SpecAugment - model.train() - input_speech = self._load_datasamples(1) - feature_extractor = WhisperFeatureExtractor() - input_features = feature_extractor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - with no_grad(): - logits = model( - input_features, - decoder_input_ids=mindspore.tensor([[50258, 50259, 50359]]), - output_hidden_states=False, - output_attentions=False, - return_dict=False, - use_cache=False, - ) - - # fmt: off - EXPECTED_LOGITS = mindspore.tensor( - [ - 0.9362, -4.7105, 5.0879, 3.9642, 1.0013, -6.0096, 4.7285, -3.1847, - -0.8648, 1.9631, 6.2653, 3.6936, 0.3575, -4.5818, 3.0564, 7.8712, - 2.9951, 0.6848, 9.9497, -2.6638, 1.1571, -6.8546, -1.4333, -7.7584, - 1.1200, 3.9030, 4.4655, -4.4919, -1.1703, 9.6241 - ] - ) - # fmt: on - 
self.assertTrue(ops.allclose(logits[0][0, 0, :30], EXPECTED_LOGITS, atol=1e-4)) - - @slow - def test_generate_with_prompt_ids(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - input_speech = self._load_datasamples(4)[-1:] - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - output_without_prompt = model.generate(input_features) - prompt_ids = processor.get_prompt_ids("Leighton", return_tensors="ms") - output_with_prompt = model.generate(input_features, prompt_ids=prompt_ids) - - expected_without_prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" - expected_with_prompt = "<|startofprev|> Leighton<|startoftranscript|><|en|><|transcribe|><|notimestamps|> He has grave doubts whether Sir Frederick Leighton's work is really Greek after all and can discover in it but little of Rocky Ithaca.<|endoftext|>" - - output_without_prompt = processor.decode(output_without_prompt[0]) - output_with_prompt = processor.decode(output_with_prompt[0]) - - self.assertEqual(output_with_prompt, expected_with_prompt) - self.assertEqual(output_without_prompt, expected_without_prompt) - - @slow - def test_language_detection(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - input_speech = self._load_datasamples(4)[-1:] - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - lang_id = model.detect_language(input_features)[0].item() - - ids_to_lang = {v: k for k, v in model.generation_config.lang_to_id.items()} - - assert ids_to_lang[lang_id] == "<|en|>" - - audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") - - raw_audio, sr = read(audio) - input_speech = Resample(sr, 16_000)(raw_audio) - - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - lang_id = model.detect_language(input_features)[0].item() - - assert ids_to_lang[lang_id] == "<|hi|>" - - @slow - def test_default_multilingual_transcription_short_form(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") - - raw_audio, sr = read(audio) - input_speech = Resample(sr, 16_000)(raw_audio) - - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - - # task defaults to transcribe - sequences = model.generate(input_features) - - transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] - - assert ( - transcription - == "<|startoftranscript|><|hi|><|transcribe|><|notimestamps|> Mirchi mein ki tene vibinda prajatiya hai<|endoftext|>" - ) - - # set task to translate - sequences = model.generate(input_features, task="translate") - transcription = processor.batch_decode(sequences, skip_special_tokens=False)[0] - - assert ( - transcription - == "<|startoftranscript|><|hi|><|translate|><|notimestamps|> How much is the difference between the girls?<|endoftext|>" - ) - - @slow - def test_default_multilingual_transcription_long_form(self): - processor = 
WhisperProcessor.from_pretrained("openai/whisper-large-v2") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") - - audio = hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset") - - raw_audio, sr = read(audio) - input_speech = Resample(sr, 16_000)(raw_audio) - - input_speech = input_speech.tile((1, 10)) - input_features = processor( - input_speech, return_tensors="ms", padding="longest", truncation=False, sampling_rate=16_000 - ).input_features - - # task defaults to transcribe - sequences = model.generate(input_features) - - transcription = processor.batch_decode(sequences)[0] - - assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?" - - # set task to translate - sequences = model.generate(input_features, task="translate") - transcription = processor.batch_decode(sequences)[0] - - assert ( - transcription - == " How many different species are there in the chilli? How many different species are there in the chilli?" - ) - - @slow - def test_generate_with_prompt_ids_and_forced_decoder_ids(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - task = "translate" - language = "de" - expected_tokens = [f"<|{task}|>", f"<|{language}|>"] - prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="ms") - - output = model.generate(input_features, task=task, language=language, prompt_ids=prompt_ids) - text = processor.decode(output[0]) - - self.assertTrue(prompt in text) - self.assertTrue(all(token in text for token in expected_tokens)) - - @slow - def test_generate_with_prompt_ids_and_no_non_prompt_forced_decoder_ids(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - input_speech = self._load_datasamples(1) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - prompt = "test prompt" - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="ms") - - model.generation_config.forced_decoder_ids = None - model.config.forced_decoder_ids = None - - output = model.generate(input_features, prompt_ids=prompt_ids, return_timestamps=True) - text = processor.decode(output[0]) - - self.assertTrue(prompt in text) - - @slow - def test_speculative_decoding_distil(self): - ms_dtype = mindspore.float16 - model_id = "openai/whisper-large-v2" - model = WhisperForConditionalGeneration.from_pretrained( - model_id, ms_dtype=ms_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - - processor = WhisperProcessor.from_pretrained(model_id) - - assistant_model_id = "distil-whisper/distil-large-v2" - assistant_model = WhisperForCausalLM.from_pretrained( - assistant_model_id, ms_dtype=ms_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - sample = dataset[0]["audio"] - - input_features = processor(sample["array"], return_tensors="ms", sampling_rate=16_000).input_features - input_features = input_features.to(dtype=mindspore.float16) - - # warm up assisted decoding - _ = model.generate(input_features, assistant_model=assistant_model) - # warm up non-assisted decoding - _ 
= model.generate(input_features) - - # assisted decoding - start_time = time.time() - tokens = model.generate(input_features, assistant_model=assistant_model) - total_time_assist = time.time() - start_time - - transcription_ass = processor.batch_decode(tokens, skip_special_tokens=True) - - # non-assisted decoding - start_time = time.time() - tokens = model.generate(input_features) - total_time_non_assist = time.time() - start_time - - transcription_non_ass = processor.batch_decode(tokens, skip_special_tokens=True) - - assert transcription_ass == transcription_non_ass - assert transcription_ass == [ - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." - ] - assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" - - @slow - def test_speculative_decoding_non_distil(self): - ms_dtype = mindspore.float16 - model_id = "openai/whisper-large-v2" - model = WhisperForConditionalGeneration.from_pretrained( - model_id, ms_dtype=ms_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - - processor = WhisperProcessor.from_pretrained(model_id) - - assistant_model_id = "openai/whisper-tiny" - assistant_model = WhisperForConditionalGeneration.from_pretrained( - assistant_model_id, ms_dtype=ms_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - sample = dataset[0]["audio"] - - input_features = processor(sample["array"], return_tensors="ms", sampling_rate=16_000).input_features - input_features = input_features.to(mindspore.float16) - - # warm up assisted decoding - _ = model.generate(input_features, assistant_model=assistant_model) - # warm up non-assisted decoding - _ = model.generate(input_features) - - # assisted decoding - start_time = time.time() - tokens = model.generate(input_features, assistant_model=assistant_model) - total_time_assist = time.time() - start_time - - transcription_ass = processor.batch_decode(tokens, skip_special_tokens=True) - - # non-assisted decoding - start_time = time.time() - tokens = model.generate(input_features) - total_time_non_assist = time.time() - start_time - - transcription_non_ass = processor.batch_decode(tokens, skip_special_tokens=True) - - assert transcription_ass == transcription_non_ass - assert transcription_ass == [ - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel." - ] - assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster" - - @slow - def test_whisper_longform_single_batch(self): - # fmt: off - EXPECTED_TEXT = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampoo or a Turkish bath. 
Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes the customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mantelboard. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. By Harry Quilter M.A. Because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accoing dove. He has gone and gone for good, answered Polychrome, would manage to squeeze into the room beside the dragon and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now? In Quared Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic or she'd have worked it before. I do not know, confess shaggy. True, a great calico. Calico went to the big gong and pounded on it just as we're good to use to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing ruggedos discarded ruby crown and holding in his hand to scepter which ruggedo had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the titling cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. 
Oli's heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. out, there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. a man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. had died before during the 20s and death during the last round was in some ways easier than defeat. Breathing deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. the powerful twist that's rest of the side, in and under the guard."] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") - one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) - - input_features = processor( - one_audio, return_tensors="ms", truncation=False, padding="longest", sampling_rate=16_000 - )["input_features"] - - result = model.generate(input_features, return_timestamps=True) - decoded = processor.batch_decode(result, skip_special_tokens=True) - - assert decoded == EXPECTED_TEXT - - decoded_with_timestamps = processor.batch_decode(result, skip_special_tokens=True, decode_with_timestamps=True) - - no_timestamp_matches = re.split(r"<\|[\d\.]+\|>", decoded_with_timestamps[0]) - - assert ["".join(no_timestamp_matches)] == EXPECTED_TEXT - - timestamp_matches = re.findall(r"<\|[\d\.]+\|>", decoded_with_timestamps[0]) - - timestamp_floats = [float(t[2:-2]) for t in timestamp_matches] - - is_increasing = all(timestamp_floats[i] <= timestamp_floats[i + 1] for i in range(len(timestamp_floats) - 1)) - - assert is_increasing - - @slow - def test_whisper_longform_prompt_ids(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - prompt = "Mr. Kilter, Brionno." 
# let's force Quilter -> Kilter, Brion -> Brionno - prompt_ids = processor.get_prompt_ids(prompt, return_tensors="ms") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]") - one_audio = np.concatenate([x["array"] for x in ds["audio"]], dtype=np.float32) - - first_text = ds[0]["text"].lower() - last_text = ds[-1]["text"].lower() - - input_features = processor( - one_audio, return_tensors="ms", truncation=False, padding="longest", sampling_rate=16_000 - )["input_features"] - - result = model.generate( - input_features, - prompt_ids=prompt_ids, - return_timestamps=True, - prompt_condition_type="first-segment", - condition_on_prev_tokens=True, - ) - decoded_first_segment = processor.batch_decode(result, skip_special_tokens=True) - - result = model.generate( - input_features, - prompt_ids=prompt_ids, - return_timestamps=True, - prompt_condition_type="all-segments", - condition_on_prev_tokens=True, - ) - decoded_all_segments = processor.batch_decode(result, skip_special_tokens=True) - - # show that first segment has quilter and last segment has brion - assert "quilter" in first_text - assert "brion" in last_text - - # condition on first segment correctly changes to kilter in first segment, but does not transcribe "brianno" correctly - assert "kilter" in decoded_first_segment[0][: len(first_text)].lower() - assert "brionno" not in decoded_first_segment[0][-len(last_text) :].lower() - - # condition on all-segment correctly changes to kilter in first segment and correctly transcribes "brianno" - assert "kilter" in decoded_all_segments[0][: len(first_text)].lower() - assert "brionno" in decoded_all_segments[0][-len(last_text) :].lower() - - @slow - def test_whisper_longform_single_batch_prev_cond(self): - # fmt: off - EXPECTED_TEXT = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite itals are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says like a shampooer and a Turkish bath, next man it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Unfortunately his own work never does get good. Mr. 
Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M.A. because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone and gone for good. answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced and your friends are asking for you. I begged Ruggido long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest in all our dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. It's funny, remarked Betsy thoughtfully. I don't believe and knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it, just as Ruggido used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggido's discarded ruby crown. And holding it in his hand, the scepter which Ruggido had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the titling cloth that was the only german he wore. The cut on his chest, still dripping blood. The ache of his overstrained eyes, even to soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. In seconds he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. 
Just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. In deeply, Breon softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that's rested aside, in and under the guard."] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") - one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) - - input_features = processor( - one_audio, return_tensors="ms", truncation=False, padding="longest", sampling_rate=16_000 - )["input_features"] - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, - } - - set_seed(2345) - result = model.generate(input_features, **gen_kwargs) - decoded = processor.batch_decode(result, skip_special_tokens=True) - - assert decoded == EXPECTED_TEXT - - @slow - def test_whisper_shortform_single_batch_prev_cond(self): - # fmt: off - EXPECTED_TEXT = [" Folks, I spend a lot of time right over there, night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing and the most topical antilock breaks and power steering pain, Stakingly stitching, leather seating so soft, it would make JD power and her associate blush. If you were to create the luxury sedan that is my nightly model, but sometimes— you're sometimes, folks— I lurched the consciousness and the back of an abandoned school bus"] - EXPECTED_TEXT1 = [" Folks, I spend a lot of time right over there night after night after, actually. Carefully selecting for you the day's noisiest, most aerodynamic headlines, stress testing, and the most topical, anti-lock breaks and power steering, painstakingly stitching, leather seating, so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. 
I lurched a consciousness in the back of an abandoned school"] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - ds = load_dataset("distil-whisper/meanwhile", "default")["test"] - dataset = ds.cast_column("audio", Audio(sampling_rate=16000)) - - one_audio = dataset[1]["audio"]["array"] - - input_features = processor(one_audio, return_tensors="ms", sampling_rate=16_000)["input_features"] - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, - } - - set_seed(2345) - result = model.generate(input_features, **gen_kwargs) - decoded = processor.batch_decode(result.sequences, skip_special_tokens=True) - - assert decoded == EXPECTED_TEXT - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.3, - "temperature": (0.0, 0.2), - "compression_ratio_threshold": 1, - "condition_on_prev_tokens": False, - "logprob_threshold": -1.0, - } - - set_seed(2345) - result = model.generate(input_features, **gen_kwargs) - decoded = processor.batch_decode(result.sequences, skip_special_tokens=True) - - assert decoded == EXPECTED_TEXT1 - - @slow - def test_whisper_longform_single_batch_beam(self): - # fmt: off - EXPECTED_TEXT = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. When Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath, next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. He tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art with Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in felicitous grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the topper of painting. By Harry Quilter, M.A., because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. 
He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded this grace, and your friends are asking for you. I begged Ruggado long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest in all our dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, a great Calico. Calico went to the big gong and pounded on it, just as Ruggado used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggado's discarded ruby crown, and holding in his hand to scepter which Ruggado had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-laying cloth that was the only german who wore. The cut on his chest was still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small, sharp, blow high on his chest. One minute, a voice said, and a time buzzer sounded, a minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oli's heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. 
The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Breon's head died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. In the powerful twist that's rest of the side, in and under the guard."] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") - one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) - - input_features = processor( - one_audio, return_tensors="ms", truncation=False, padding="longest", sampling_rate=16_000 - )["input_features"] - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - "num_beams": 2, - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, - } - - def check_gen_kwargs(inputs, generation_config, *args, **kwargs): - assert generation_config.num_beams == gen_kwargs["num_beams"] - - self._patch_generation_mixin_generate(check_args_fn=check_gen_kwargs) - - set_seed(2345) - result = model.generate(input_features, **gen_kwargs) - decoded = processor.batch_decode(result, skip_special_tokens=True) - - assert decoded == EXPECTED_TEXT - - @slow - def test_whisper_longform_multi_batch(self): - # fmt: off - EXPECTED_TEXT_1 = [" Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing a poster or near the fire, and the ornaments Fred brought home from India on the mental board. 
In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. a Harry Quilter M.A. Because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone, and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot a long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, St. Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The middle forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confess Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it, just as Virgato used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgados discarded Ruby Crown and holding in his hand to scepter, which Virgato had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest is still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp, blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oliya's heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. the twenties, he must have drawn his gun, because the intruder said quickly, but that away you're being a fool. 
Out, there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second started grasp and ran forward. Our role had looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our role. and sensed it and knew the fifth point was his. Then the powerful twist that's thrust to the side in and under the guard."] - EXPECTED_TEXT_2 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker."] - EXPECTED_TEXT_3 = [" possible. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-guards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath, next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. Under general principles of art, Mr. Quilter writes with equal lucidity. Painting, he tells us, is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire. 
any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter, M.A. Because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, all poor ashaggy sits there, accoing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced, and your friends are asking for you. I begged Ruggadot a long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, St. Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The middle forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confess Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it, just as Virgato used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgados discarded Ruby Crown and holding in his hand the scepter, which Virgato had so often thrown at his head. The man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the tight-lowing cloth that was the only german to war. The cut on his chest still dripping blood. The ache of his overstrained eyes, even to soaring arena around him with thousands of spectators, retroveilities not worth thinking about. His instant panic was followed by a small sharp, blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Oily his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. 
the twenties, he must have drawn his gun, because the intruder said quickly, but that away you're being a fool. Out, there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our role. Breon sensed it and knew the fifth point was his. the powerful twist that's rest of the side, in and under the guard."] - EXPECTED_TEXT_4 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampoo or a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes the customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mantelboard. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. By Harry Quilter M.A. 
Because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accoing dove. He has gone and gone for good, answered Polychrome, would manage to squeeze into the room beside the dragon and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now? In Quared Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic or she'd have worked it before. I do not know, confess shaggy. True, a great calico. Calico went to the big gong and pounded on it just as we're good to use to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing ruggedos discarded ruby crown and holding in his hand to scepter which ruggedo had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the titling cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Oli's heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you're being a fool. out, there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. a man who entered the twenties had his own training tricks. 
They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. had died before during the 20s and death during the last round was in some ways easier than defeat. Breathing deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. the powerful twist that's rest of the side, in and under the guard."] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") - one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) - audios = [] - audios.append(one_audio[110000:]) - audios.append(one_audio[:800000]) - audios.append(one_audio[80000:]) - audios.append(one_audio[:]) - - decoded_single = [] - for audio in audios: - inputs = processor(audio, return_tensors="ms", truncation=False, sampling_rate=16_000) - - result = model.generate(**inputs, return_timestamps=True) - decoded_single.append(processor.batch_decode(result, skip_special_tokens=True)) - - inputs = processor( - audios, - return_tensors="ms", - truncation=False, - padding="longest", - return_attention_mask=True, - sampling_rate=16_000, - ) - - result = model.generate(**inputs, return_timestamps=True) - decoded_all = processor.batch_decode(result, skip_special_tokens=True) - - # make sure single & batch is exactly the same - assert decoded_all[0:1] == decoded_single[0] - assert decoded_all[1:2] == decoded_single[1] - assert decoded_all[2:3] == decoded_single[2] - assert decoded_all[3:4] == decoded_single[3] - - # exact match - assert decoded_all[0:1] == EXPECTED_TEXT_1 - assert decoded_all[1:2] == EXPECTED_TEXT_2 - assert decoded_all[2:3] == EXPECTED_TEXT_3 - assert decoded_all[3:4] == EXPECTED_TEXT_4 - - @slow - def test_whisper_longform_multi_batch_prev_cond(self): - # fmt: off - EXPECTED_TEXT_1 = [" Mr. Quilters manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of Rocky Ithaca. The Nils, pictures are sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilters writes with equal lucidity. 
Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are of two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does get good. Mr. Quilters has missed his chance, for he has failed even to make himself the tougher of painting. My hair equal to MA. Because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they are asking for you. I begged Ruggedo long ago to send him away, but he would not do so. I also offered to help you brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard since shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now in Quarage Shaggy? In the metal forest. Where is that? The metal forest is in the great domed cavern. The largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny remarked but see you thoughtfully. I don't believe Anne knew any magic or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as we're good to use to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing reggos, discarded ruby crown, and holding in his hand to scepter which reggado had so often thrown at his head. The man said to the universe, Sir, I exist. Sweat covered Brianna's body trickling into the tight-wing cloth that was the only garment he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute of voice said, and the time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie sliding out on the borders of consciousness. The contestants in the twenties needed undisturbed rest. 
Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. But at the end of the 20s, he must have drawn his gun because the intruder said quickly, but that away, he'd be no fool. Out, the resoundance then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inexplicably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne's softly spoke the autahypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Her role clipped the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how closely both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from her role. Brienne sensed it and knew the fifth point was his. In the powerful twist that's first to decide. In and under the guard."] - EXPECTED_TEXT_2 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennials, pictures are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker"] - EXPECTED_TEXT_3 = [" gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating in its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins work is really Greek after all and can discover in it but little of rocky ithaka. Lennils, pictures, are a sort of upguards and atom paintings and Mason's exquisite itals are as national as a jingo poem. Mr. Birkut Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. Under general principles of art, Mr. 
Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostoror. Near the fire, any ornaments spread brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks with pleasing courtesy and solicitous grace that many faces are feeling, only unfortunately his own work never does get good. Mr. Quilter has missed his chance. For he has failed even to make himself the tougher of painting by Harry Quilter MA. Because he was sleeping instead of conquering, the lovely Rus princess has become a fiddle with a bow while poor shaggy sits there, a cooling dove. He has gone and gone for good. Answered polychrome, who had managed to squeeze into the room beside the dragon and had witnessed the occurrences with much interest. I have remained the prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they are asking for you. I begged Ruggedo long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, such a shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now, inquired Shaggy, in the metal forest? Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked a bedsy thoughtfully. I don't believe Anne knew any magic or she'd have worked before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Ruggedo used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing Ruggedo's discarded ruby crown and holding in his hand the scepter which Ruggedo had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the tight-wing cloth that was the only garment he wore. The cut on his chest still dripping blood. The ache of his overstrain dyes, even the soaring arena around him with thousands of spectators, retrievalidates not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie sliding out on the borders of consciousness. The contestants in the 20s needed undisturbed rest. 
Therefore, knights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. To 20s, he must have drawn his gun because the intruder said quickly, but that away, he'd be no fool. Out, there was silence then, and still wondering, Brienne was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were inexplicably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brienne softly spoke the odd hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled up the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brienne saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our old. Brienne sensed it and knew it was a fifth point was his. Then the powerful twist that's for us to decide in and under the guard."] - EXPECTED_TEXT_4 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins' work is really Greek after all, and can discover in it but little of rocky Ithaca. Lennils, pictures, are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Berkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate and expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, thereof two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing apostorer. Near the fire, any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin, for not recognizing that a picture should denote the frailty of man. 
And remarks with pleasing courtesy and solicitous grace that many phases of feeling only, unfortunately, his own work never does, get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tougher of painting. My Harry Quilter, MA. Because he was sleeping instead of conquering, the lovely rose princess has become a fiddle with a bow, while poor shaggy sits there, a cooling dove. He has gone and gone for good, answered polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled in disgrace in your friends, they are asking for you. I begged Ruggedo a long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he does not work too hard, since Shaggy. He doesn't work at all. In fact, there is nothing he can do in these dominions, as well as our nooms, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico, whereas my brother now, in Quilter Shaggy, in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all our dominions replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. That's funny, remarked a bit, see you thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confessed Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it, just as we're good to have used to do, but no one answered the summons. Having returned to the royal cavern, Calico first pounded the gong and then sat in the throne, wearing reggos, discarded ruby crown, and holding in his hand to scepter which reggado had so often thrown at his head. A man said to the universe, Sir, I exist. Sweat covered Breon's body, trickling into the titling cloth of a zeal-neighurment he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrievalidies not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzer's were triggered as muscles into complete relaxation. Only his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding out on the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, knights and the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see, and I'll stand aside. To twenties, he must have drawn his gun because the intruders had quickly, but that away, here being a fool. Out, there is silence then, and still wondering, Brian was once more asleep. 
Ten seconds, he asked the handler who was needing his aching muscles. I've read here at Mountain of a Man, with an apparently inexhaustible story of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were inexplicably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties, and death during the last round was, in some ways, easier than defeat. Breathing deeply, Brian's softly spoke the autahypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the maze at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Brian saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from Irohog. Brian sensed it and knew the fifth point was his. In the powerful twist that's first to decide. In and under the guard."] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") - one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) - audios = [] - audios.append(one_audio[110000:]) - audios.append(one_audio[:800000]) - audios.append(one_audio[80000:]) - audios.append(one_audio[:]) - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": 0.0, - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, - } - - decoded_single = [] - for audio in audios: - inputs = processor(audio, return_tensors="ms", truncation=False, sampling_rate=16_000) - - result = model.generate(**inputs, **gen_kwargs) - decoded_single.append(processor.batch_decode(result, skip_special_tokens=True)) - - # exact match - assert decoded_single[0] == EXPECTED_TEXT_1 - assert decoded_single[1] == EXPECTED_TEXT_2 - assert decoded_single[2] == EXPECTED_TEXT_3 - assert decoded_single[3] == EXPECTED_TEXT_4 - - @slow - def test_whisper_longform_multi_batch_hard(self): - # fmt: off - EXPECTED_TEXT = [ - " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. 
set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile.", - " Folks, I spend a lot of time right over there, night after night after night, actually. Carefully selecting for you the day's noosiest, most aerodynamic headlines, stress testing, and those topical anti-lock breaks and power steering, painstakingly stitching, leather seating so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. I lurched a consciousness in the back of an abandoned school and slap myself awake with a crusty floor mat. Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen moon, render a gas tank out of an empty big gulp, fill with white claw and denatured alcohol, then light a match and let her rip and the demented one man soapbox derby of news that is my segment. Me, Guadalupe! No!", - " Ladies and gentlemen, you know, I spent a lot of time right over there Raising the finest Holstein news cattle firmly yet tenderly milking the latest headlines from their jokes swollen teats Churning the daily stories into the decadent proven-style style triple cream breed that is my nightly monologue But sometimes sometimes folks I stagger home hungry after being released by the police and Root around in the neighbor's trash can for an old milk carton scrape out the blooming dairy residue into the remains of a wet cheese rod I won from a rat in a pre-donned street fight. Put it in a discarded paint can to leave it to ferment next to a trash fire then hunker down and hallucinate while eating the listeria laden demon custard of news that is my segment. You mean one of them.", - " Folks, if you watch this show, you know I spend most of my time right over there carefully sorting through the day's biggest stories and selecting only the most subtle and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ichol Gregoire Ferrandi, who carefully dye them in a palette of bright zesty shades and adorn them in the finest and most topical inlay work using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddles stitching. In line it with bees, wax, coated linen, finely attached a mallet, hammered strap, pearled hardware, and close-shit to create for you the one-of-a-kind hoke couture, Erme's Birkin bag that is my monologue. But sometimes, sometimes folks, sometimes. Sometimes I wake up in the last car of an abandoned roller coaster at Coney Island where I'm I'm hiding from the triads. I have some engine lubricants out of a safe way bag and stagger down the shore to tear the sail off a beach schooner. Then I rip the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel lovely folks. And use it to stitch the sail into a loose pouch like a rock sack. And I stow away in the back of a garbage truck to the junkyard where I pick through to the debris for only the broken toys that make me the saddest until I have loaded for you. The Hobo Fugitives bug out, bindle of news that is my segment. Me one!", - " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's biggest stories right over there. 
Meticulously selecting the most topical chakra affirming scented candles, and using Feng Shui to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue. But sometimes just sometimes I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself, and used fry oil, wrap my hands with some double-duct tape by stole from the broken car window. Pound a six-pack of blueberry hard-seltzer and a sack of pills I stole from a parked ambulance. Then arm wrestle a raccoon in the back alley vision quest of news that is my segment. Meanwhile!", - " You know, folks, I spend most of my time right over there. Mining the day's biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels. Then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. Then, using the Germanic tradition press-black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards in a faceplate and, finally, using fluted strips of white alloyed molding, I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating, Anglo-Saxon battle helm that is my nightly monologue. Sometimes, sometimes folks. Sometimes, just sometimes, I come into my sense as fully naked on the deck of a pirate besieged melee container ship that picked me up floating on the detached door of a portapotty in the Indian Ocean. Then after a sunstroke-induced realization of the crew of this ship plans to sell me an exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe at a pool chain that accepting my new role as Captain and declaring myself king of the windarc seas. I grab a dirty mop bucket covered in barnacles and adorn it with the teeth of the vanquished to create the sopping wet pirate crown of news that is my segment. Meanwhile!", - " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's Newsiest most topical flower eggs milk and butter and Stranding into a fine batter to make delicate and informative comedy pancakes Then I glaze them in the juice and zest of the most relevant midnight Valencia oranges and douse it all and a fine Dela main de voyage cognac Before prom baying and basting them tables. I deserve for you the James Beard award worthy crepe suzzette That is my nightly monologue, but sometimes just sometimes folks. I wake up in the baggage hold of Greyhound bus. It's being hoisted by the scrap yard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps and busted open bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strain, pair of sweatpants and as oven mitts to extract and serve the demented transience poundcake of news that is my segment. Me, Guadalupe!", - " Folks, if you watched the show and I hope you do, I spent a lot of time right over there. Tiredlessly studying the lineage of the days most important thoroughbred stories and whole-stiner headlines, working with the best trainers, money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen. 
That is my nightly monologue, but sometimes, sometimes, folks, I break into an unincorporated veterinary genetics lab and grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pilfered DNA of a horse and whatever was in a tube labeled Keith Colan extra. Slurrying the concoction with caffeine pills and a microwave red bull, I screamed, sang a prayer to Janice, initiator of human life and God of transformation as a half horse, half man, freak. Seizes to life before me and the hideous collection of loose animal parts and corrupted man tissue that is my segment. Meanwhile!" - ] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - ds = load_dataset("distil-whisper/meanwhile", "default")["test"] - ds = ds.cast_column("audio", Audio(sampling_rate=16000)) - - num_samples = 8 - - audio = ds[:num_samples]["audio"] - audios = [x["array"] for x in audio] - - decoded_single = [] - for audio in audios: - inputs = processor(audio, return_tensors="ms", truncation=False, sampling_rate=16_000) - - result = model.generate(**inputs, return_timestamps=True) - decoded_single += processor.batch_decode(result, skip_special_tokens=True) - - inputs = processor( - audios, - return_tensors="ms", - truncation=False, - padding="longest", - return_attention_mask=True, - sampling_rate=16_000, - ) - - result = model.generate(**inputs, return_timestamps=True) - decoded_all = processor.batch_decode(result, skip_special_tokens=True) - - for i in range(num_samples): - assert decoded_all[i] == decoded_single[i] - assert decoded_all[i] == EXPECTED_TEXT[i] - - @slow - def test_whisper_longform_multi_batch_hard_prev_cond(self): - # Without this set here, this test may fail if it is run with other tests (say, `test_tiny_*`). It's unclear - # why other tests may affect this tests: it seems some random operations are beyond the scene. - set_seed(2345) - # fmt: off - EXPECTED_TEXT = [ - " Folks, if you watch the show, you know I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories, developing the central headline pawns, definitely maneuvering an oh-so-topical night to F6, faming of classic Sicilian, named or variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a Fisher shows in lip-nitsky attack that culminates in the elegant lethal slow-played, all-pass on checkmate that is my nightly monologue, but sometimes sometimes folks I sometimes I start to the wake-up side down in the monkey bars of a condemned playground on a super fun site, get all hepped up on goofballs, rummage that would discard a tag bag of defective toys, yank out a fistball of disembodied doll limbs, toss them on a stain kid's place mad from a defunct denies, set up a table inside a rusty cargo container down by the warf and challenge toothless drifters to the godless bughouse blitz of tournament that is my segment, meanwhile.", - " Folks, I spent a lot of time right over there night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing on those topical anti-lock breaks and power steering, painstakingly stitching, leather seating, so soft, it would make JD power and her associates blush. To create the luxury sedan that is my nightly monologue, but sometimes I just sometimes focus. 
I lurched to consciousness in the back of an abandoned school bus and slapped myself awake with a crusty floor mat. Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen-moon render a gas tank out of an empty big gulp, filled with white claw and de-natured alcohol, then light a match and let her rip in the dis-mented one man, soapbox derby of news that is my segment.", - " Ladies and gentlemen, you know, I spent a lot of time right over there, raising the finest hosting news cattle firmly, yet tenderly milking the latest headlines from their jokes, swollen teats, churning the daily stories into the decadent Provincil style triple cream-breed. It is my nightly monologue, but sometimes sometimes I stagger home hungry after being released by the police and root around in the neighbor's trash can for an old milk carton scrape out the blooming dairy residue into the remains of a wet cheese rod I won from a rat in a pre-drawn street fight. Put it in a discarded paint can to leave it to ferment next to a trash fire than a hunker down in hallucinate while eating the Listeria latent demon custard of news that is my segment.", - " Folks, you watched this show, you know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle, and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ickel Greg Waferandi, who carefully died them in a pallet of bright, zesty shades, and adorn them in the finest most topical inlay work, using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddle stitching, and line it with bees, wax, coated linen, and finally attach a mallet hammered strap, purled hardware, and close-shet to create for you the one of a kind hope kutur, Ernme, is burkin bag that is my monologue, but sometimes, sometimes folks, sometimes. Sometimes I wake up in the last car of an abandoned rollercoaster at Coney Island where I'm hiding from the triads, I have some engine lubricants out of a safe way bag and staggered down the shore to tear the sail off a beach skoener, then I ripped the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel, lovely folks, and use it to stitch the sail into a loose pouch-like rock sack, and I stow in the back of a garbage truck to the junkyard, where I pick through to the debris for only the broken toys that make me the saddest, until I have loaded for you, the hobo fugitives bug out bindle of news that", - " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. meticulously selecting the most topical chakra affirming scented candles, using Feng Shui, to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue, but sometimes just sometimes, I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself and use fry oil, wrap my hands and some old duct tape I stole from a broken car window, pound a six pack of blueberry hard-seller and a second pill, as I stole from a parked ambulance, then arm wrestle a raccoon in the back alley vision quest of news that is my segment.", - " You know, folks, I spend most of my time right over there. 
Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. Then, using the Germanic tradition press, black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards and a face plate, and finally using fluted strips of white, alloyed molding, I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating, Anglo-Saxon battle helm that is my nightly monologue. But sometimes, sometimes, folks. Sometimes, just sometimes, I come to my senses fully naked on the deck of a pirate-be-seed, melee, container ship that picked me up floating on the detached door of a porta-potty in the Indian Ocean. Then, after a sunstroke induced realization of the crew of this ship plans to sell me an exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe and a pool chain that accepting my new role as captain and declaring myself King of the Windark Seas. I grab a dirty mop bucket covered in barnacles and adorn it with the teeth of the vanquished to create these shopping wet pirate crown of news that is my segment. Me wild!", - " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most relevant midnight valencio oranges. And doubts at all, and I find delimane de voyage cognac, before from bang and basting them tables, I deserve you the James Beard Award worthy creeps to ZET. That is my nightly monologue, but sometimes sometimes folks, I wake up in the baggage hole of Greyhound bus, it's being hoisted by the scrapyard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps, busted up in bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strained pair of sweatpants and as ovenmets to extract and serve the demented transients pound cake of news that is my segment.", - ( - " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cohen-Extra. Slurring the concoction with caffeine pills and a microwave bread bowl, I scream sing a prayer to Janice initiator of human life and God of Transformation as a half horse, half man freak ceases to life before me and the hideous collection of loose animal parts and corrupted men tissue that is my segment. Meanwhile!", - " Folks, if you watch the show and I hope you do, I spend a lot of time right over there. 
Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines, working with the best trainers money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen that is my nightly monologue. But sometimes sometimes folks I break into an unincorporated veterinary genetics lab. And grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pill for DNA of a horse and whatever was in a tube labeled Keith Cohen-Extra. Slurring the concoction with caffeine pills and a microwave bread bowl, I screamed sing a prayer to Janice initiator of human life and God of Transformation as a half horse, half man freak ceases to life before me and the hideous collection of loose animal parts and corrupted men tissue that is my segment. Meanwhile!", - ) - ] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - ds = load_dataset("distil-whisper/meanwhile", "default")["test"] - ds = ds.cast_column("audio", Audio(sampling_rate=16000)) - - num_samples = 8 - - audio = ds[:num_samples]["audio"] - audios = [x["array"] for x in audio] - - inputs = processor( - audios, - return_tensors="ms", - truncation=False, - padding="longest", - return_attention_mask=True, - sampling_rate=16_000, - ) - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, - "num_beams": 5, - } - - result = model.generate(**inputs, **gen_kwargs) - decoded_all = processor.batch_decode(result, skip_special_tokens=True) - - for i in range(num_samples): - if isinstance(EXPECTED_TEXT[i], str): - assert decoded_all[i] == EXPECTED_TEXT[i] - elif isinstance(EXPECTED_TEXT[i], tuple): - assert decoded_all[i] in EXPECTED_TEXT[i] - - @slow - def test_whisper_shortform_multi_batch_hard_prev_cond(self): - # Without this set here, this test may fail if it is run with other tests (say, `test_tiny_*`). It's unclear - # why other tests may affect this tests: it seems some random operations are beyond the scene. - set_seed(2345) - # fmt: off - EXPECTED_TEXT = [ - ' Mr. Kfilter is the apostle of the Middle Classes and we are glad to welcome his gospel.', - " Nor is Mr. Qilter's manner less interesting than his matter.", - ' He tells us that at this festive season of the year, with Christmas and roce beef, looming before us, similarly drawn from eating and its results occur most readily to the mind.', - ' He has grabbed those with her surfered trigger late and his work is really a great after all, and can discover it in it but little of Rocky Ithaka.', - " L'Neile's pictures are a sort of upguards and add-um paintings, and Maessin's exquisite Itals are a national as a jingo poem. Mr. Birkett Foster's landscapes smiled at one much in the same way that Mr. Carcher used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slapper in the back, before he says,", - ' It is obviously unnecessary for us, to point out how luminous these criticisms are, how delicate and expression.', - ' On the general principles of art and Mr. 
Kriltor rights with equal lucidity.', - ' Painting, he tells us is of a different quality to mathematics and finish in art is adding more effect.', - ] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - num_samples = 8 - - audio = ds[:num_samples]["audio"] - audios = [x["array"] for x in audio] - - inputs = processor( - audios, - return_tensors="ms", - sampling_rate=16_000, - ) - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": -1.0, - } - - result = model.generate(**inputs, **gen_kwargs) - decoded_all = processor.batch_decode(result.sequences, skip_special_tokens=True) - - for i in range(num_samples): - if isinstance(EXPECTED_TEXT[i], str): - assert decoded_all[i] == EXPECTED_TEXT[i] - - @slow - def test_whisper_longform_no_speech_detection(self): - # fmt: off - EXPECTED_TEXT = [ - " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories. Developing the central headline pawns, definitely maneuvering and also topical night to F6.", - " Folks, I spent a lot of time right over there night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing", - ' Ladies and gentlemen, you know, I spent a lot of time right over there raising the finest Holstein news cattle firmly yet tenderly milking the latest headlines from their joke swollen teats', - ' Folks, you watched this show, you know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the', - " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. meticulously selecting the most topical chakra affirming scented candles, using Feng Shui,", - ' You know, folks, I spend most of my time right over there. Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest.', - " Folks, if you watch this show, you know I spend most of my time right over there, carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most...", - " Folks, if you watch the show and I hope you do, I spent a lot of time right over there. 
Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines.", - ] - # fmt: on - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - ds = load_dataset("distil-whisper/meanwhile", "default")["test"] - ds = ds.cast_column("audio", Audio(sampling_rate=16000)) - - num_samples = 8 - - audio = ds[:num_samples]["audio"] - audios = [x["array"] for x in audio] - - # Make sure the second chunk is silent - for audio in audios: - audio[15 * 16000 : 60 * 16000] = 0.0 - - inputs = processor( - audios, - return_tensors="ms", - truncation=False, - padding="longest", - return_attention_mask=True, - sampling_rate=16_000, - ) - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.2, - "temperature": (0.0,), - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, - "logprob_threshold": 0.0, # Ignore logprob, use only no-speech prob - "num_beams": 5, - } - - set_seed(2345) - result = model.generate(**inputs, **gen_kwargs) - decoded_all = processor.batch_decode(result, skip_special_tokens=True) - - for i in range(num_samples): - assert decoded_all[i] == EXPECTED_TEXT[i] - - @require_mindspore_gpu - @slow - def test_whisper_empty_longform(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - ds = load_dataset("distil-whisper/meanwhile", "default")["test"] - ds = ds.cast_column("audio", Audio(sampling_rate=16000)) - - num_samples = 8 - - audio = ds[:num_samples]["audio"] - audios = [x["array"] for x in audio] - audios[0][:] = np.zeros(audios[0].shape) - - inputs = processor( - audios, - return_tensors="ms", - truncation=False, - padding="longest", - return_attention_mask=True, - sampling_rate=16_000, - ) - - gen_kwargs = { - "no_speech_threshold": 0.2, - "temperature": (0.0,), - "logprob_threshold": 0.0, # Ignore logprob, use only no-speech prob - "num_beams": 5, - "language": "fr", - "task": "transcribe", - } - - set_seed(2345) - model.generate(**inputs, **gen_kwargs) - - @slow - def test_tiny_static_generation(self): - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - input_speech = self._load_datasamples(4) - input_features = processor(input_speech, return_tensors="ms", sampling_rate=16_000).input_features - eager_generated_ids = model.generate(input_features, max_new_tokens=64) - - model.generation_config.cache_implementation = "static" - # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - # compile the forward pass and assert equivalence - static_generated_ids = model.generate(input_features, max_new_tokens=64) - assert (eager_generated_ids == static_generated_ids).all() - - # check the compiled graph can be re-used and that the cache is correctly reset - # reverse the ordering of the input features - permutation_idx = ( - ops.arange(input_features.shape[0], 0, step=-1, dtype=mindspore.int64) - 1 - ) - input_features = input_features[permutation_idx, ...] 
- static_generated_ids = model.generate(input_features, max_new_tokens=64) - # assert re-ordered generations match those from eager - assert (eager_generated_ids[permutation_idx, :] == static_generated_ids).all() - - @slow - def test_tiny_static_generation_long_form(self): - - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - - dataset = load_dataset("distil-whisper/meanwhile", "default")["test"] - dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) - input_speech = [audio["array"] for audio in dataset[2:4]["audio"]] - - inputs = processor( - input_speech, - return_tensors="ms", - padding="longest", - truncation=False, - return_attention_mask=True, - sampling_rate=16_000, - ) - - gen_kwargs = { - "return_timestamps": True, - "no_speech_threshold": 0.6, - "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), - "compression_ratio_threshold": 1.35, - "condition_on_prev_tokens": True, # conditioning on prev tokens introduces a recompile on the second time step - "logprob_threshold": -1.0, - "num_beams": 1, - } - - set_seed(42) - eager_generated_ids = model.generate(**inputs, **gen_kwargs) - - # compile the forward pass and assert equivalence - model.generation_config.cache_implementation = "static" - # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - set_seed(42) - static_generated_ids = model.generate(**inputs, **gen_kwargs) - assert (eager_generated_ids == static_generated_ids).all() - - # check the compiled graph can be re-used and that the cache is correctly reset - # reverse the ordering of the input features - input_features = inputs.input_features - permutation_idx = ( - ops.arange(input_features.shape[0], 0, step=-1, dtype=mindspore.int64) - 1 - ) - input_features = input_features[permutation_idx, ...] - attention_mask = inputs.attention_mask[permutation_idx, ...] 
- - set_seed(42) - static_generated_ids = model.generate(input_features, attention_mask=attention_mask, **gen_kwargs) - # assert re-ordered generations match those from eager - assert (eager_generated_ids[permutation_idx, :] == static_generated_ids).all() - - -def prepare_whisper_encoder_inputs_dict(config, input_features, head_mask=None): - if head_mask is None: - head_mask = ops.ones(config.encoder_layers, config.encoder_attention_heads) - return {"input_features": input_features, "head_mask": head_mask} - - -@require_mindspore -class WhisperEncoderModelTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden layers - seq_length=60, - is_training=True, - use_labels=True, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - input_channels=1, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=30, - num_mel_bins=80, - num_conv_layers=1, - suppress_tokens=None, - begin_suppress_tokens=None, - classifier_proj_size=4, - num_labels=2, - is_encoder_decoder=False, - is_decoder=False, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_mel_bins = num_mel_bins - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.num_conv_layers = num_conv_layers - self.suppress_tokens = suppress_tokens - self.begin_suppress_tokens = begin_suppress_tokens - self.classifier_proj_size = classifier_proj_size - self.num_labels = num_labels - self.is_encoder_decoder = is_encoder_decoder - self.is_decoder = is_decoder - - def get_config(self): - return WhisperConfig( - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - decoder_ffn_dim=self.hidden_size, - encoder_ffn_dim=self.hidden_size, - suppress_tokens=self.suppress_tokens, - begin_suppress_tokens=self.begin_suppress_tokens, - classifier_proj_size=self.classifier_proj_size, - num_labels=self.num_labels, - is_encoder_decoder=self.is_encoder_decoder, - is_decoder=self.is_decoder, - ) - - def prepare_config_and_inputs(self): - input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length]) - - config = self.get_config() - inputs_dict = prepare_whisper_encoder_inputs_dict( - config, - input_features=input_features, - ) - return config, inputs_dict - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - return config, inputs_dict - - def get_subsampled_output_lengths(self, input_lengths): - """ - Computes the output length of the convolutional layers - """ - - for i in range(self.num_conv_layers): - input_lengths = (input_lengths - 1) // 2 + 1 - - return input_lengths - - @property - def 
encoder_seq_length(self): - return self.get_subsampled_output_lengths(self.seq_length) - - def create_and_check_model_forward(self, config, inputs_dict, use_weighted_layer_sum=False): - config.use_weighted_layer_sum = use_weighted_layer_sum - model = WhisperForAudioClassification(config=config) - model.eval() - - input_features = inputs_dict["input_features"] - - with no_grad(): - last_hidden_state = model(input_features).logits - - self.parent.assertTrue(last_hidden_state.shape, (13, 2)) - - -@require_mindspore -class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (WhisperForAudioClassification,) if is_mindspore_available() else () - is_encoder_decoder = False - fx_compatible = False - test_pruning = False - test_missing_keys = False - - input_name = "input_features" - - def setUp(self): - self.model_tester = WhisperEncoderModelTester(self) - self.config_tester = ConfigTester(self, config_class=WhisperConfig) - self.maxDiff = 3000 - - def test_config(self): - self.config_tester.run_common_tests() - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["input_features", "head_mask", "encoder_outputs"] - self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - - def test_forward_pass(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs) - - def test_forward_pass_weighted_layer_sum(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_forward(*config_and_inputs, use_weighted_layer_sum=True) - - @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.") - def test_cpu_offload(self): - pass - - @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.") - def test_disk_offload_bin(self): - pass - - @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. Skip for now.") - def test_disk_offload_safetensors(self): - pass - - @unittest.skip(reason="Some undefined behavior encountered with tiny versions of this model. 
Skip for now.") - def test_model_parallelism(self): - pass - - @unittest.skip(reason="Not applicable for an encoder-only acoustic model") - def test_inputs_embeds(self): - # input embeds is meaningless for an encoder-only acoustic model - pass - - # the equivalent test is passing the encoder outputs directly to the model - def test_encoder_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - with no_grad(): - outputs = model(**inputs)[0] - - encoder = model.encoder - - encoder_inputs = {"input_features": inputs["input_features"]} - del inputs["input_features"] - - if "head_mask" in inputs: - encoder_inputs["head_mask"] = inputs["head_mask"] - if "attention_mask" in inputs: - encoder_inputs["attention_mask"] = inputs["attention_mask"] - if "output_attentions" in inputs: - encoder_inputs["output_attentions"] = inputs["output_attentions"] - - with no_grad(): - inputs["encoder_outputs"] = encoder(**encoder_inputs) - outputs_embeds = model(**inputs)[0] - - self.assertTrue((outputs_embeds == outputs).all()) - - # Needs to override as the encoder input embedding is a Conv1d - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Conv1d)) - model.set_input_embeddings(nn.Conv1d(10, 10, 3)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Conv1d)) - - # WhisperEncoder cannot resize token embeddings since it has no tokens embeddings - @unittest.skip(reason="Model has no tokens embeds") - def test_resize_tokens_embeddings(self): - pass - -class WhisperStandaloneDecoderModelTester: - def __init__( - self, - parent, - batch_size=3, # need batch_size != num_hidden layers - is_training=True, - use_labels=False, - vocab_size=200, - hidden_size=16, - num_hidden_layers=2, - num_attention_heads=4, - input_channels=1, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - max_source_positions=30, - max_target_positions=40, - bos_token_id=98, - eos_token_id=98, - pad_token_id=0, - num_mel_bins=80, - decoder_start_token_id=85, - num_conv_layers=1, - suppress_tokens=None, - begin_suppress_tokens=None, - ): - self.parent = parent - self.batch_size = batch_size - self.is_training = is_training - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.input_channels = input_channels - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.num_mel_bins = num_mel_bins - self.max_position_embeddings = max_position_embeddings - self.max_source_positions = max_source_positions - self.max_target_positions = max_target_positions - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - self.bos_token_id = bos_token_id - self.decoder_start_token_id = decoder_start_token_id - self.num_conv_layers = num_conv_layers - self.suppress_tokens = suppress_tokens - self.begin_suppress_tokens = begin_suppress_tokens - - def prepare_config_and_inputs(self): - input_features = 
floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size) - - decoder_input_ids = mindspore.tensor( - self.batch_size * [[self.decoder_start_token_id, 3, 3, 7, 2]] - ) - - config = self.get_config() - config.is_encoder_decoder = False - inputs_dict = prepare_whisper_inputs_dict( - config, - attention_mask=None, - input_features=input_features, - decoder_input_ids=decoder_input_ids, - ) - - inputs_dict.pop("input_features") - inputs_dict.pop("head_mask") - inputs_dict.pop("decoder_head_mask") - inputs_dict.pop("cross_attn_head_mask") - - inputs_dict["attention_mask"] = inputs_dict.pop("decoder_attention_mask") - inputs_dict["input_ids"] = inputs_dict.pop("decoder_input_ids") - return config, inputs_dict - - @property - def encoder_seq_length(self): - return 5 - - @property - def seq_length(self): - return 5 - - def get_config(self): - return WhisperConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - input_channels=self.input_channels, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - max_source_positions=self.max_source_positions, - max_target_positions=self.max_target_positions, - eos_token_id=self.eos_token_id, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - decoder_ffn_dim=self.hidden_size, - encoder_ffn_dim=self.hidden_size, - decoder_start_token_id=self.decoder_start_token_id, - suppress_tokens=self.suppress_tokens, - begin_suppress_tokens=self.begin_suppress_tokens, - ) - - def prepare_config_and_inputs_for_common(self): - config, inputs_dict = self.prepare_config_and_inputs() - - inputs_dict["input_ids"][:, -1] = self.pad_token_id - - return config, inputs_dict - - def prepare_config_and_inputs_for_decoder(self): - config, input_features = self.prepare_config_and_inputs() - input_ids = input_features["input_ids"] - encoder_hidden_states = floats_tensor([self.batch_size, self.decoder_seq_length, self.hidden_size]) - - return (config, input_ids, encoder_hidden_states) - - def create_and_check_decoder_model_past(self, config, input_ids): - config.use_cache = True - model = WhisperDecoder(config=config).eval() - # first forward pass - outputs = model(input_ids, use_cache=True) - outputs_use_cache_conf = model(input_ids) - outputs_no_past = model(input_ids, use_cache=False) - - self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf)) - self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1) - - past_key_values = outputs["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - - output_from_no_past = model(next_input_ids)["last_hidden_state"] - output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - def 
create_and_check_decoder_model_attention_mask_past(self, config, input_ids): - model = WhisperDecoder(config=config).eval() - - # create attention mask - attn_mask = ops.ones(input_ids.shape, dtype=mindspore.int64) - - half_seq_length = input_ids.shape[-1] // 2 - attn_mask[:, half_seq_length:] = 0 - - # first forward pass - past_key_values = model(input_ids, attention_mask=attn_mask, use_cache=True)["past_key_values"] - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # change a random masked slice from input_ids - random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1 - random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1) - input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens - - # append to next input_ids and attn_mask - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - attn_mask = ops.cat( - [attn_mask, ops.ones((attn_mask.shape[0], 1), dtype=mindspore.int64)], - dim=1, - ) - - # get two different outputs - output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - assert ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3) - - -@require_mindspore -class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (WhisperDecoder, WhisperForCausalLM) if is_mindspore_available() else () - all_generative_model_classes = (WhisperForCausalLM,) if is_mindspore_available() else () - fx_comptatible = False - test_pruning = False - is_encoder_decoder = False - test_missing_keys = False - - def setUp(self): - self.model_tester = WhisperStandaloneDecoderModelTester(self, is_training=False) - self.config_tester = ConfigTester(self, config_class=WhisperConfig) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_decoder_model_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config, inputs_dict = config_and_inputs - - self.model_tester.create_and_check_decoder_model_past(config=config, input_ids=inputs_dict["input_ids"]) - - def test_decoder_model_attn_mask_past(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - config, inputs_dict = config_and_inputs - - self.model_tester.create_and_check_decoder_model_attention_mask_past( - config=config, input_ids=inputs_dict["input_ids"] - ) - - @unittest.skip(reason="Tested implicitly through the encoder-decoder tests") - def test_custom_4d_attention_mask(self): - pass - - @unittest.skip(reason="Generate needs input ids") - def test_generate_without_input_ids(self): - # generate only works with input ids for whisper - pass - - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for whisper - pass - - @unittest.skip(reason="Decoder can't keep attention grads") - def test_retain_grad_hidden_states_attentions(self): - return - - @unittest.skip(reason="The model doesn't support 
fast init from base") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_reuse_cache(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_padding_right(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_inference(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_inference_padding_right(self): - pass \ No newline at end of file diff --git a/tests/transformers/models/x_clip/__init__.py b/tests/transformers/models/x_clip/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/x_clip/test_modeling_x_clip.py b/tests/transformers/models/x_clip/test_modeling_x_clip.py deleted file mode 100644 index 80d653874..000000000 --- a/tests/transformers/models/x_clip/test_modeling_x_clip.py +++ /dev/null @@ -1,712 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Testing suite for the MindSpore XCLIP model.""" - -import inspect -import os -import tempfile -import unittest - -import numpy as np -from huggingface_hub import hf_hub_download - -from mindspore import ops - -from mindnlp.transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.core.serialization import safe_load_file, safe_save_file - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore as ms - from mindnlp.core import nn - - from mindnlp.transformers import XCLIPModel, XCLIPTextModel, XCLIPVisionModel - - -if is_vision_available(): - from mindnlp.transformers import XCLIPProcessor - - -class XCLIPVisionModelTester: - def __init__( - self, - parent, - batch_size=8, - image_size=30, - patch_size=2, - num_channels=3, - num_frames=8, # important; the batch size * time must be divisible by the number of frames - is_training=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - mit_hidden_size=64, - dropout=0.1, - attention_dropout=0.1, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_frames = num_frames - self.is_training = is_training - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.mit_hidden_size = mit_hidden_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.initializer_range = initializer_range - self.scope = scope - - # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size) ** 2 - self.seq_length = num_patches + 1 - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [self.batch_size * self.num_frames, self.num_channels, - self.image_size, self.image_size] - ) - config = self.get_config() - - return config, pixel_values - - def get_config(self): - return XCLIPVisionConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - num_frames=self.num_frames, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - mit_hidden_size=self.mit_hidden_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, pixel_values): - model = XCLIPVisionModel(config=config) - model.set_train(False) - result = model(pixel_values) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) - image_size = (self.image_size, self.image_size) - patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * \ - (image_size[0] // patch_size[0]) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size * - self.num_frames, num_patches + 1, self.hidden_size) - ) - self.parent.assertEqual( - result.pooler_output.shape, 
(self.batch_size * self.num_frames, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class XCLIPVisionModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as X-CLIP does not use input_ids, inputs_embeds, - attention_mask and seq_length. - """ - - all_model_classes = (XCLIPVisionModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - - def setUp(self): - self.model_tester = XCLIPVisionModelTester(self) - self.config_tester = ConfigTester( - self, config_class=XCLIPVisionConfig, has_text_modality=False, hidden_size=37 - ) - - def test_config(self): - self.config_tester.run_common_tests() - - @unittest.skip(reason="X-CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="XCLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="XCLIPVisionModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/xclip-base-patch32" - model = XCLIPVisionModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - def test_gradient_checkpointing_backward_compatibility(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - config.gradient_checkpointing = True - model = model_class(config) - self.assertTrue(model.is_gradient_checkpointing) - - def test_attention_outputs(self): - config, 
inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - # we add 1 here due to the special message token in X-CLIP's vision encoder - seq_len = getattr(self.model_tester, "seq_length", None) + 1 - encoder_seq_length = getattr( - self.model_tester, "encoder_seq_length", seq_len) - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(len(outputs.attentions), - self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - self.assertEqual(len(outputs.attentions), - self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(outputs.attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, encoder_seq_length], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model( - **self._prepare_for_class(inputs_dict, model_class)) - - self.assertEqual(out_len + 1, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), - self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, encoder_seq_length], - ) - - @unittest.skip('MindSpore need mpirun to launch multi-process for data_parallel') - def test_multi_gpu_data_parallel_forward(self): - pass - - -class XCLIPTextModelTester: - def __init__( - self, - parent, - batch_size=8, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.dropout = dropout - self.attention_dropout = attention_dropout - self.max_position_embeddings = max_position_embeddings - self.initializer_range = initializer_range - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask( - [self.batch_size, self.seq_length]) - - if input_mask is not None: - batch_size, seq_length = input_mask.shape - rnd_start_indices = np.random.randint( - 1, seq_length - 1, size=(batch_size,)) - for batch_idx, start_index in enumerate(rnd_start_indices): - input_mask[int(batch_idx), :int(start_index)] = 1 - input_mask[int(batch_idx), int(start_index):] = 0 - - config = self.get_config() - - return config, 
input_ids, input_mask - - def get_config(self): - return XCLIPTextConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - dropout=self.dropout, - attention_dropout=self.attention_dropout, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - ) - - def create_and_check_model(self, config, input_ids, input_mask): - model = XCLIPTextModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, - (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual( - result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, input_mask = config_and_inputs - inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (XCLIPTextModel,) if is_mindspore_available() else () - fx_compatible = False - test_pruning = False - test_head_masking = False - - def setUp(self): - self.model_tester = XCLIPTextModelTester(self) - self.config_tester = ConfigTester( - self, config_class=XCLIPTextConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_training(self): - pass - - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - - @unittest.skip(reason="X-CLIP does not use inputs_embeds") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="XCLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip(reason="XCLIPTextModel has no base class and is not available in MODEL_MAPPING") - def test_save_load_fast_init_to_base(self): - pass - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/xclip-base-patch32" - model = XCLIPTextModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -class XCLIPModelTester: - def __init__( - self, - parent, - text_kwargs=None, - vision_kwargs=None, - projection_dim=64, - mit_hidden_size=64, - is_training=True, - ): - if text_kwargs is None: - text_kwargs = {} - if vision_kwargs is None: - vision_kwargs = {} - - self.parent = parent - self.projection_dim = projection_dim - self.mit_hidden_size = mit_hidden_size - self.text_model_tester = XCLIPTextModelTester(parent, **text_kwargs) - self.vision_model_tester = XCLIPVisionModelTester( - parent, **vision_kwargs) - # need bs for batching_equivalence test - self.batch_size = self.text_model_tester.batch_size - 
self.is_training = is_training - - def prepare_config_and_inputs(self): - text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - vision_config, _ = self.vision_model_tester.prepare_config_and_inputs() - pixel_values = floats_tensor( - [ - self.vision_model_tester.batch_size, - self.vision_model_tester.num_frames, - self.vision_model_tester.num_channels, - self.vision_model_tester.image_size, - self.vision_model_tester.image_size, - ] - ) - - config = self.get_config() - - return config, input_ids, attention_mask, pixel_values - - def get_config(self): - return XCLIPConfig.from_text_vision_configs( - self.text_model_tester.get_config(), - self.vision_model_tester.get_config(), - projection_dim=self.projection_dim, - ) - - def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): - model = XCLIPModel(config).set_train(False) - result = model(input_ids, pixel_values, attention_mask) - self.parent.assertEqual( - result.logits_per_video.shape, - (self.vision_model_tester.batch_size, - self.text_model_tester.batch_size), - ) - self.parent.assertEqual( - result.logits_per_text.shape, - (self.text_model_tester.batch_size, - self.vision_model_tester.batch_size), - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, input_ids, attention_mask, pixel_values = config_and_inputs - inputs_dict = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "pixel_values": pixel_values, - "return_loss": True, - } - return config, inputs_dict - - -@require_mindspore -class XCLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (XCLIPModel,) if is_mindspore_available() else () - pipeline_model_mapping = { - "feature-extraction": XCLIPModel} if is_mindspore_available() else {} - fx_compatible = False - test_head_masking = False - test_pruning = False - test_resize_embeddings = False - test_attention_outputs = False - test_torchscript = False - maxdiff = None - - def setUp(self): - self.model_tester = XCLIPModelTester(self) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - @unittest.skip(reason="Hidden_states is tested in individual model tests") - def test_hidden_states_output(self): - pass - - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") - def test_inputs_embeds(self): - pass - - @unittest.skip(reason="Retain_grad is tested in individual model tests") - def test_retain_grad_hidden_states_attentions(self): - pass - - @unittest.skip(reason="XCLIPModel does not have input/output embeddings") - def test_model_get_set_embeddings(self): - pass - - @unittest.skip(reason="XCLIPModel does not support feedforward chunking") - def test_feed_forward_chunking(self): - pass - - # override as the `logit_scale`, `prompts_generator.alpha` parameters require special treatment - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.parameters_and_names(): - if param.requires_grad: - # check if `logit_scale` is initilized as per the original implementation - if name == "logit_scale": - self.assertAlmostEqual( - param.data.item(), - np.log(1 / 0.07), - delta=1e-3, - msg=f"Parameter {name} of model {model_class} seems 
not properly initialized", - ) - elif name == "prompts_generator.alpha": - self.assertAlmostEqual( - param.data.mean().item(), model.config.prompt_alpha) - else: - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def _create_and_check_torchscript(self, config, inputs_dict): - if not self.test_torchscript: - return - - configs_no_init = _config_zero_init( - config) # To be sure we have no Nan - configs_no_init.torchscript = True - configs_no_init.return_dict = False - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.set_train(False) - - try: - input_ids = inputs_dict["input_ids"] - # X-CLIP needs pixel_values - pixel_values = inputs_dict["pixel_values"] - traced_model = ops.trace(model, (input_ids, pixel_values)) - except RuntimeError: - self.fail("Couldn't trace module.") - - with tempfile.TemporaryDirectory() as tmp_dir_name: - pt_file_name = os.path.join( - tmp_dir_name, "traced_model.safetensors") - - try: - safe_save_file(traced_model, pt_file_name) - except Exception: - self.fail("Couldn't save module.") - - try: - loaded_model = safe_load_file(pt_file_name) - except Exception: - self.fail("Couldn't load module.") - - model.set_train(False) - - loaded_model.set_train(False) - - model_state_dict = model.state_dict() - loaded_model_state_dict = loaded_model.state_dict() - - non_persistent_buffers = {} - for key in loaded_model_state_dict.keys(): - if key not in model_state_dict.keys(): - non_persistent_buffers[key] = loaded_model_state_dict[key] - - loaded_model_state_dict = { - key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers - } - - self.assertEqual(set(model_state_dict.keys()), - set(loaded_model_state_dict.keys())) - - model_buffers = list(model.buffers()) - for non_persistent_buffer in non_persistent_buffers.values(): - found_buffer = False - for i, model_buffer in enumerate(model_buffers): - if ops.equal(non_persistent_buffer, model_buffer): - found_buffer = True - break - - self.assertTrue(found_buffer) - model_buffers.pop(i) - - models_equal = True - for layer_name, p1 in model_state_dict.items(): - p2 = loaded_model_state_dict[layer_name] - if p1.data.ne(p2.data).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_load_vision_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # Save XCLIPConfig and check if we can load XCLIPVisionConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - vision_config = XCLIPVisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual( - config.vision_config.to_dict(), vision_config.to_dict()) - - # Save XCLIPConfig and check if we can load XCLIPTextConfig from it - with tempfile.TemporaryDirectory() as tmp_dir_name: - config.save_pretrained(tmp_dir_name) - text_config = XCLIPTextConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual( - config.text_config.to_dict(), text_config.to_dict()) - - @slow - def test_model_from_pretrained(self): - model_name = "microsoft/xclip-base-patch32" - model = XCLIPModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on a spaghetti video -def prepare_video(): - file = hf_hub_download( - repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti_8_frames.npy", repo_type="dataset" - ) - video = 
np.load(file) - return list(video) - - -@require_vision -@require_mindspore -class XCLIPModelIntegrationTest(unittest.TestCase): - @slow - def test_inference(self): - model_name = "microsoft/xclip-base-patch32" - model = XCLIPModel.from_pretrained(model_name) - processor = XCLIPProcessor.from_pretrained(model_name) - - video = prepare_video() - inputs = processor( - text=["playing sports", "eating spaghetti", "go shopping"], videos=video, return_tensors="ms", padding=True - ) - - # forward pass - outputs = model(**inputs) - - # verify the logits - self.assertEqual( - outputs.logits_per_video.shape, - (inputs.pixel_values.shape[0], inputs.input_ids.shape[0]), - ) - self.assertEqual( - outputs.logits_per_text.shape, - (inputs.input_ids.shape[0], inputs.pixel_values.shape[0]), - ) - - expected_logits = ms.Tensor([[14.0181, 20.2771, 14.4776]]) - - self.assertTrue(np.allclose( - outputs.logits_per_video.asnumpy(), expected_logits.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/xlm/__init__.py b/tests/transformers/models/xlm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/xlm/test_modeling_xlm.py b/tests/transformers/models/xlm/test_modeling_xlm.py deleted file mode 100644 index 99a0e2031..000000000 --- a/tests/transformers/models/xlm/test_modeling_xlm.py +++ /dev/null @@ -1,539 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -from mindnlp.transformers import XLMConfig -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - XLMForMultipleChoice, - XLMForQuestionAnswering, - XLMForQuestionAnsweringSimple, - XLMForSequenceClassification, - XLMForTokenClassification, - XLMModel, - XLMWithLMHeadModel, - ) - from mindnlp.transformers.models.xlm.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_LIST - - -class XLMModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_lengths=True, - use_token_type_ids=True, - use_labels=True, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=2, - vocab_size=99, - n_special=0, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=2, - num_choices=4, - summary_type="last", - use_proj=True, - scope=None, - bos_token_id=0, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_lengths = use_input_lengths - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.vocab_size = vocab_size - self.n_special = n_special - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.summary_type = summary_type - self.use_proj = use_proj - self.scope = scope - self.bos_token_id = bos_token_id - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - input_lengths = None - if self.use_input_lengths: - input_lengths = ( - ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 - ) # small variation of seq_length - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) - - sequence_labels = None - token_labels = None - is_impossible_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - is_impossible_labels = ids_tensor([self.batch_size], 2).float() - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return ( - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - 
input_mask, - ) - - def get_config(self): - return XLMConfig( - vocab_size=self.vocab_size, - n_special=self.n_special, - emb_dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - gelu_activation=self.gelu_activation, - sinusoidal_embeddings=self.sinusoidal_embeddings, - asm=self.asm, - causal=self.causal, - n_langs=self.n_langs, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - summary_type=self.summary_type, - use_proj=self.use_proj, - num_labels=self.num_labels, - bos_token_id=self.bos_token_id, - ) - - def create_and_check_xlm_model( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = XLMModel(config=config) - - model.set_train(False) - result = model(input_ids, lengths=input_lengths, langs=token_type_ids) - result = model(input_ids, langs=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - - def create_and_check_xlm_lm_head( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = XLMWithLMHeadModel(config) - - model.set_train(False) - - result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_xlm_simple_qa( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = XLMForQuestionAnsweringSimple(config) - - model.set_train(False) - - outputs = model(input_ids) - - outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) - result = outputs - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_xlm_qa( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = XLMForQuestionAnswering(config) - - model.set_train(False) - - result = model(input_ids) - - result_with_labels = model( - input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask, - ) - - result_with_labels = model( - input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - ) - - (total_loss,) = result_with_labels.to_tuple() - - result_with_labels = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) - - (total_loss,) = result_with_labels.to_tuple() - - self.parent.assertEqual(result_with_labels.loss.shape, ()) - self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) - self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top)) - self.parent.assertEqual( - result.end_top_log_probs.shape, (self.batch_size, 
model.config.start_n_top * model.config.end_n_top) - ) - self.parent.assertEqual( - result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) - ) - self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) - - def create_and_check_xlm_sequence_classif( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - model = XLMForSequenceClassification(config) - - model.set_train(False) - - result = model(input_ids) - result = model(input_ids, labels=sequence_labels) - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - - def create_and_check_xlm_token_classif( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - config.num_labels = self.num_labels - model = XLMForTokenClassification(config) - - model.set_train(False) - - result = model(input_ids, attention_mask=input_mask, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_xlm_for_multiple_choice( - self, - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ): - config.num_choices = self.num_choices - model = XLMForMultipleChoice(config=config) - - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_lengths, - sequence_labels, - token_labels, - is_impossible_labels, - choice_labels, - input_mask, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths} - return config, inputs_dict - - -class XLMModelTest(ModelTesterMixin, GenerationTesterMixin): - all_model_classes = ( - ( - XLMWithLMHeadModel, - XLMForQuestionAnswering, - XLMForSequenceClassification, - XLMForQuestionAnsweringSimple, - XLMForTokenClassification, - XLMForMultipleChoice, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (XLMWithLMHeadModel,) if is_mindspore_available() else () - ) # TODO (PVP): Check other models whether language generation is also applicable - pipeline_model_mapping = ( - { - "feature-extraction": XLMModel, - "fill-mask": XLMWithLMHeadModel, - "question-answering": XLMForQuestionAnsweringSimple, - "text-classification": XLMForSequenceClassification, - "text-generation": XLMWithLMHeadModel, - "token-classification": XLMForTokenClassification, - "zero-shot": XLMForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, 
model_architecture, tokenizer_name, processor_name - ): - if ( - pipeline_test_casse_name == "QAPipelineTests" - and tokenizer_name is not None - and not tokenizer_name.endswith("Fast") - ): - # `QAPipelineTests` fails for a few models when the slower tokenizer are used. - # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework) - # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer - return True - - return False - - # XLM has 2 QA models -> need to manually set the correct labels for one of them here - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "XLMForQuestionAnswering": - inputs_dict["start_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64) - inputs_dict["end_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64) - - return inputs_dict - - def setUp(self): - self.model_tester = XLMModelTester(self) - self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_xlm_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_model(*config_and_inputs) - - def test_xlm_lm_head(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) - - def test_xlm_simple_qa(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_simple_qa(*config_and_inputs) - - def test_xlm_qa(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_qa(*config_and_inputs) - - def test_xlm_sequence_classif(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) - - def test_xlm_token_classif(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_token_classif(*config_and_inputs) - - def test_xlm_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - - for idx, iter_attentions in enumerate(attentions): - # adds PAD dummy token - tgt_len = min_length + idx + 1 - src_len = min_length + idx + 1 - - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - src_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions) - ) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, tuple) for 
iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - - for idx, iter_hidden_states in enumerate(hidden_states): - # adds PAD dummy token - seq_len = min_length + idx + 1 - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - # check hidden size - self.assertListEqual( - [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], - [expected_shape] * len(iter_hidden_states), - ) - pass - - @slow - def test_model_from_pretrained(self): - for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = XLMModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class XLMModelLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_xlm_mlm_en_2048(self): - model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048") - - input_ids = mindspore.tensor([[14, 447]], dtype=mindspore.int64) # the president - expected_output_ids = [ - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - 14, - 447, - ] # the president the president the president the president the president the president the president the president the president the president - # TODO(PVP): this and other input_ids I tried for generation give pretty bad results. Not sure why. Model might just not be made for auto-regressive inference - output_ids = model.generate(input_ids, do_sample=False) - self.assertListEqual(output_ids[0].asnumpy().tolist(), expected_output_ids) diff --git a/tests/transformers/models/xlm/test_tokenization_xlm.py b/tests/transformers/models/xlm/test_tokenization_xlm.py deleted file mode 100644 index 395bd1897..000000000 --- a/tests/transformers/models/xlm/test_tokenization_xlm.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -import unittest - -from mindnlp.transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer -from mindnlp.utils.testing_utils import slow - -from ...test_tokenization_common import TokenizerTesterMixin - - -class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMTokenizer - test_rust_tokenizer = False - - def setUp(self): - super().setUp() - - # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt - vocab = [ - "l", - "o", - "w", - "e", - "r", - "s", - "t", - "i", - "d", - "n", - "w", - "r", - "t", - "lo", - "low", - "er", - "low", - "lowest", - "newer", - "wider", - "", - ] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["l o 123", "lo w 1456", "e r 1789", ""] - - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w") as fp: - fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: - fp.write("\n".join(merges)) - - def get_input_output_texts(self, tokenizer): - input_text = "lower newer" - output_text = "lower newer" - return input_text, output_text - - def test_full_tokenizer(self): - """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt""" - tokenizer = XLMTokenizer(self.vocab_file, self.merges_file) - - text = "lower" - bpe_tokens = ["low", "er"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = tokens + [""] - input_bpe_tokens = [14, 15, 20] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - @slow - def test_sequence_builders(self): - tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") - - text = tokenizer.encode("sequence builders", add_special_tokens=False) - text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) - - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - assert encoded_sentence == [0] + text + [1] - assert encoded_pair == [0] + text + [1] + text_2 + [1] \ No newline at end of file diff --git a/tests/transformers/models/xlm_prophetnet/__init__.py b/tests/transformers/models/xlm_prophetnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py b/tests/transformers/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py deleted file mode 100644 index eff6aa04c..000000000 --- a/tests/transformers/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py +++ /dev/null @@ -1,141 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team, The Microsoft Research team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import unittest -import numpy as np -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import slow - - -if is_mindspore_available(): - import mindspore - from mindnlp.transformers import XLMProphetNetForConditionalGeneration, XLMProphetNetTokenizer - - -class XLMProphetNetModelIntegrationTest(unittest.TestCase): - @slow - def test_pretrained_checkpoint_hidden_states(self): - model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased") - - # encoder-decoder outputs - encoder_ids = mindspore.Tensor([[17, 96208, 103471, 2]]) - decoder_prev_ids = mindspore.Tensor( - [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]] - ) - output = model( - input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids - ) - output_predited_logis = output[0] - expected_shape = (1, 14, 250012) - self.assertEqual(output_predited_logis.shape, expected_shape) - expected_slice = mindspore.Tensor( - [[[-6.3986, -8.2391, 12.5189], [-6.3289, -8.0864, 12.6211], [-6.2418, -8.0445, 12.7968]]] - ) - #self.assertTrue(np.allclose(output_predited_logis[:, :3, :3], expected_slice, atol=1e-4)) - - # encoder outputs - encoder_outputs = model.prophetnet.encoder(encoder_ids)[0] - expected_encoder_outputs_slice = mindspore.Tensor( - [[[-1.4260, -0.7628, 0.8453], [-1.4719, -0.1391, 0.7807], [-1.7678, 0.0114, 0.4646]]] - ) - expected_shape_encoder = (1, 4, 1024) - self.assertEqual(encoder_outputs.shape, expected_shape_encoder) - self.assertTrue(np.allclose(encoder_outputs[:, :3, :3].asnumpy(), expected_encoder_outputs_slice.asnumpy(), atol=1e-4)) - - # decoder outputs - decoder_outputs = model.prophetnet.decoder( - decoder_prev_ids, - encoder_hidden_states=encoder_outputs, - ) - predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 14, -1) - predicting_streams_logits = model.lm_head(predicting_streams) - next_first_stream_logits = predicting_streams_logits[:, 0] - #self.assertTrue(np.allclose(next_first_stream_logits[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-3)) - - @slow - def test_ntg_hidden_states(self): - model = XLMProphetNetForConditionalGeneration.from_pretrained( - "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg" - ) - - encoder_ids = mindspore.Tensor([[17, 96208, 103471, 2]]) - decoder_prev_ids = mindspore.Tensor( - [[2, 250, 9953, 34, 69489, 1620, 32, 118424, 624, 210, 105, 2913, 1032, 351]] - ) - output = model( - input_ids=encoder_ids, attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids - ) - output_predited_logis = output[0] - expected_shape = (1, 14, 250012) - self.assertEqual(output_predited_logis.shape, expected_shape) - # compare the actual values for a slice. 
- expected_slice = mindspore.Tensor( - [[[-6.3986, -8.2391, 12.5189], [-6.3289, -8.0864, 12.6211], [-6.2418, -8.0446, 12.7968]]] - ) - #self.assertTrue(np.allclose(output_predited_logis[:, :3, :3].asnumpy(), expected_slice.asnumpy(), atol=3e-1)) - - @slow - def test_xprophetnet_ntg_inference(self): - model = XLMProphetNetForConditionalGeneration.from_pretrained( - "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg" - ) - model.config.max_length = 512 - - tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased-xglue-ntg") - - EN_SENTENCE = ( - "Microsoft Corporation intends to officially end free support for the Windows 7 operating system after" - " January 14, 2020, according to the official portal of the organization. From that day, users of this" - " system will not be able to receive security updates, which could make their computers vulnerable to" - " cyber attacks." - ) - RU_SENTENCE = ( - "орпорация Microsoft намерена официально прекратить бесплатную поддержку операционной системы Windows 7" - " после 14 января 2020 года, сообщается на официальном портале организации . С указанного дня пользователи" - " этой системы не смогут получать обновления безопасности, из-за чего их компьютеры могут стать уязвимыми" - " к кибератакам." - ) - ZH_SENTENCE = "根据该组织的官方门户网站,微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。从那时起,该系统的用户将无法接收安全更新,这可能会使他们的计算机容易受到网络攻击。" - - input_ids = tokenizer( - [EN_SENTENCE, RU_SENTENCE, ZH_SENTENCE], padding=True, max_length=255, return_tensors="ms" - ).input_ids - - summary_ids = model.generate( - input_ids, num_beams=10, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True - ) - generated_titles = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids] - EXPECTED_TITLE_EN = "that that" - EXPECTED_TITLE_RU = ", и в после — также" - EXPECTED_TITLE_ZH = ",,。。" - self.assertListEqual( - [EXPECTED_TITLE_EN, EXPECTED_TITLE_RU, EXPECTED_TITLE_ZH], - generated_titles, - ) - - summary_ids_beam1 = model.generate( - input_ids, num_beams=1, length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True - ) - generated_titles_beam1_tok = [ - tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1 - ] - EXPECTED_TITLE_EN_BEAM1_TOK = [] - EXPECTED_TITLE_RU_BEAM1_TOK = ['▁', '▁', '▁', '▁и', '▁', '▁', ',', '▁', '▁', '▁в', '▁', '▁', 'и', '▁', '▁', '▁—', '▁', '▁', '▁также', '▁', '▁'] - EXPECTED_TITLE_ZH_BEAM1_TOK = [',', ',', '。'] - self.assertListEqual( - [EXPECTED_TITLE_EN_BEAM1_TOK, EXPECTED_TITLE_RU_BEAM1_TOK, EXPECTED_TITLE_ZH_BEAM1_TOK], - generated_titles_beam1_tok, - ) diff --git a/tests/transformers/models/xlm_roberta/__init__.py b/tests/transformers/models/xlm_roberta/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/xlm_roberta/test_modeling_xlm_roberta.py b/tests/transformers/models/xlm_roberta/test_modeling_xlm_roberta.py deleted file mode 100644 index f840b9c57..000000000 --- a/tests/transformers/models/xlm_roberta/test_modeling_xlm_roberta.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.transformers import is_mindspore_available -from mindnlp.utils.testing_utils import ( - require_sentencepiece, - require_tokenizers, - require_mindspore, - slow, -) - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops, no_grad - - from mindnlp.transformers import XLMRobertaModel - - -@require_sentencepiece -@require_tokenizers -@require_mindspore -class XLMRobertaModelIntegrationTest(unittest.TestCase): - @slow - def test_xlm_roberta_base(self): - model = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base", attn_implementation="eager") - input_ids = mindspore.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) - # The dog is cute and lives in the garden house - - expected_output_shape = (1, 12, 768) # batch_size, sequence_length, embedding_vector_dim - expected_output_values_last_dim = mindspore.tensor( - [[-0.0101, 0.1218, -0.0803, 0.0801, 0.1327, 0.0776, -0.1215, 0.2383, 0.3338, 0.3106, 0.0300, 0.0252]] - ) - # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.base') - # xlmr.eval() - # expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1] - with no_grad(): - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(ops.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3)) - - @slow - def test_xlm_roberta_large(self): - model = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-large") - input_ids = mindspore.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) - # The dog is cute and lives in the garden house - - expected_output_shape = (1, 12, 1024) # batch_size, sequence_length, embedding_vector_dim - expected_output_values_last_dim = mindspore.tensor( - [[-0.0699, -0.0318, 0.0705, -0.1241, 0.0999, -0.0520, 0.1004, -0.1838, -0.4704, 0.1437, 0.0821, 0.0126]] - ) - # xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large') - # xlmr.eval() - # expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1] - with no_grad(): - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(ops.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3)) \ No newline at end of file diff --git a/tests/transformers/models/xlm_roberta_xl/__init__.py b/tests/transformers/models/xlm_roberta_xl/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/transformers/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py deleted file mode 100644 index df18184d5..000000000 --- a/tests/transformers/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ /dev/null @@ -1,546 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from mindnlp.transformers import XLMRobertaXLConfig -from mindnlp.utils.testing_utils import slow, require_mindspore, is_mindspore_available -import numpy as np -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - - from mindnlp.transformers import ( - XLMRobertaXLForCausalLM, - XLMRobertaXLForMaskedLM, - XLMRobertaXLForMultipleChoice, - XLMRobertaXLForQuestionAnswering, - XLMRobertaXLForSequenceClassification, - XLMRobertaXLForTokenClassification, - XLMRobertaXLModel, - ) - from mindnlp.transformers.models.xlm_roberta_xl.modeling_xlm_roberta_xl import ( - XLMRobertaXLEmbeddings, - create_position_ids_from_input_ids, - ) - - -class XLMRobertaXLModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = 
ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return XLMRobertaXLConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = XLMRobertaXLModel(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = XLMRobertaXLModel(config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = XLMRobertaXLForCausalLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - 
config.is_decoder = True - config.add_cross_attention = True - model = XLMRobertaXLForCausalLM(config=config).set_train(False) - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], axis=-1) - next_attention_mask = ops.cat([input_mask, next_mask], axis=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = XLMRobertaXLForMaskedLM(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = XLMRobertaXLForTokenClassification(config=config) - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = XLMRobertaXLForMultipleChoice(config=config) - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - 
attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = XLMRobertaXLForQuestionAnswering(config=config) - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - XLMRobertaXLForCausalLM, - XLMRobertaXLForMaskedLM, - XLMRobertaXLForSequenceClassification, - XLMRobertaXLForTokenClassification, - XLMRobertaXLForMultipleChoice, - XLMRobertaXLForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (XLMRobertaXLForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": XLMRobertaXLModel, - "fill-mask": XLMRobertaXLForMaskedLM, - "question-answering": XLMRobertaXLForQuestionAnswering, - "text-classification": XLMRobertaXLForSequenceClassification, - "text-generation": XLMRobertaXLForCausalLM, - "token-classification": XLMRobertaXLForTokenClassification, - "zero-shot": XLMRobertaXLForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - def setUp(self): - self.model_tester = XLMRobertaXLModelTester(self) - self.config_tester = ConfigTester(self, config_class=XLMRobertaXLConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = 
self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_create_position_ids_respects_padding_index(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is XLMRobertaXLEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = XLMRobertaXLEmbeddings(config=config) - - input_ids = mindspore.tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = mindspore.tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """Ensure that the default position ids only assign a sequential . This is a regression - test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is XLMRobertaXLEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = XLMRobertaXLEmbeddings(config=config) - - inputs_embeds = mindspore.numpy.empty((2, 4, 30)) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = mindspore.tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.eq(position_ids, expected_positions))) - - -@slow -@require_mindspore -class XLMRobertaModelXLIntegrationTest(unittest.TestCase): - def test_xlm_roberta_xl(self): - model = XLMRobertaXLModel.from_pretrained("facebook/xlm-roberta-xl",from_pt=True) - input_ids = mindspore.tensor( - [[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]] - ) - # The dog is cute and lives in the garden house - - expected_output_shape =(1, 12, 2560) # batch_size, sequence_length, embedding_vector_dim - expected_output_values_last_dim = mindspore.tensor( - [[0.0110, 0.0605, 0.0354, 0.0689, 0.0066, 0.0691, 0.0302, 0.0412, 0.0860, 0.0036, 0.0405, 0.0170]], - ) - - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) - - def test_xlm_roberta_xxl(self): - model = XLMRobertaXLModel.from_pretrained("facebook/xlm-roberta-xxl",from_pt=True) - input_ids = mindspore.tensor( - [[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]] - ) - # The dog is cute and lives in the garden house - - expected_output_shape = (1, 12, 4096) # batch_size, sequence_length, embedding_vector_dim - expected_output_values_last_dim = mindspore.tensor( - [[0.0046, 0.0146, 0.0227, 0.0126, 0.0219, 0.0175, -0.0101, 0.0006, 0.0124, 0.0209, -0.0063, 0.0096]], - ) - - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) diff --git a/tests/transformers/models/xlnet/__init__.py b/tests/transformers/models/xlnet/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/xlnet/test_modeling_xlnet.py b/tests/transformers/models/xlnet/test_modeling_xlnet.py deleted file mode 100644 index f1354c2c6..000000000 --- a/tests/transformers/models/xlnet/test_modeling_xlnet.py +++ /dev/null @@ -1,728 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random -import unittest - -from mindnlp.transformers import XLNetConfig, is_mindspore_available -from mindnlp.utils.testing_utils import require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - XLNetForMultipleChoice, - XLNetForQuestionAnswering, - XLNetForQuestionAnsweringSimple, - XLNetForSequenceClassification, - XLNetForTokenClassification, - XLNetLMHeadModel, - XLNetModel, - ) - - -class XLNetModelTester: - def __init__( - self, - parent, - batch_size=14, - seq_length=7, - mem_len=10, - clamp_len=-1, - reuse_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - num_attention_heads=4, - d_inner=128, - num_hidden_layers=2, - type_sequence_label_size=2, - untie_r=True, - bi_data=False, - same_length=False, - initializer_range=0.05, - seed=1, - type_vocab_size=2, - bos_token_id=1, - eos_token_id=2, - pad_token_id=5, - num_choices=4, - ): - self.parent = parent - self.batch_size = 14 - self.seq_length = 7 - self.mem_len = 10 - # self.key_len = seq_length + mem_len - self.clamp_len = -1 - self.reuse_len = 15 - self.is_training = True - self.use_labels = True - self.vocab_size = 99 - self.cutoffs = [10, 50, 80] - self.hidden_size = 32 - self.num_attention_heads = 4 - self.d_inner = 128 - self.num_hidden_layers = 5 - self.type_sequence_label_size = 2 - self.untie_r = True - self.bi_data = False - self.same_length = False - self.initializer_range = 0.05 - self.seed = 1 - self.type_vocab_size = 2 - self.bos_token_id = 1 - self.eos_token_id = 2 - self.pad_token_id = 5 - self.num_choices = 4 - - def prepare_config_and_inputs(self): - input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - segment_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) - perm_mask = ops.zeros( - self.batch_size, - self.seq_length + 1, - self.seq_length + 1, - dtype=mindspore.float32, - ) - perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token - target_mapping = ops.zeros( - self.batch_size, - 1, - self.seq_length + 1, - dtype=mindspore.float32, - ) - target_mapping[:, 0, -1] = 1.0 # predict last token - - sequence_labels = None - lm_labels = None - is_impossible_labels = None - token_labels = None - if self.use_labels: - lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - is_impossible_labels = ids_tensor([self.batch_size], 2).float() - token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - config = self.get_config() - - return ( - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ) - - def get_config(self): - return XLNetConfig( - vocab_size=self.vocab_size, - d_model=self.hidden_size, - n_head=self.num_attention_heads, - 
d_inner=self.d_inner, - n_layer=self.num_hidden_layers, - untie_r=self.untie_r, - mem_len=self.mem_len, - clamp_len=self.clamp_len, - same_length=self.same_length, - reuse_len=self.reuse_len, - bi_data=self.bi_data, - initializer_range=self.initializer_range, - num_labels=self.type_sequence_label_size, - bos_token_id=self.bos_token_id, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - ) - - def set_seed(self): - random.seed(self.seed) - mindspore.set_seed(self.seed) - mindspore.manual_seed(self.seed) - - def create_and_check_xlnet_base_model( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetModel(config) - model.eval() - - result = model(input_ids_1, input_mask=input_mask) - result = model(input_ids_1, attention_mask=input_mask) - result = model(input_ids_1, token_type_ids=segment_ids) - result = model(input_ids_1) - - config.mem_len = 0 - model = XLNetModel(config) - model.eval() - base_model_output = model(input_ids_1) - self.parent.assertEqual(len(base_model_output), 2) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertListEqual( - [mem.shape for mem in result.mems], - [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, - ) - - def create_and_check_use_mems_train( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetForSequenceClassification(config) - model.train() - - train_size = input_ids_1.shape[0] - - batch_size = 4 - for i in range(train_size // batch_size + 1): - input_ids = input_ids_1[i : (i + 1) * batch_size] - labels = sequence_labels[i : (i + 1) * batch_size] - outputs = model(input_ids=input_ids, labels=labels, return_dict=True) - self.parent.assertIsNone(outputs.mems) - self.parent.assertIsNotNone(outputs.loss) - - def create_and_check_xlnet_model_use_mems( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetModel(config=config) - model.eval() - - # first forward pass - causal_mask = ops.ones( - input_ids_1.shape[0], - input_ids_1.shape[1], - input_ids_1.shape[1], - dtype=mindspore.float32, - ) - causal_mask = ops.triu(causal_mask, diagonal=0) - outputs_cache = model(input_ids_1, use_mems=True, perm_mask=causal_mask) - outputs_no_cache = model(input_ids_1, use_mems=False, perm_mask=causal_mask) - outputs_conf = model(input_ids_1) - - self.parent.assertTrue(len(outputs_cache) == len(outputs_conf)) - self.parent.assertTrue(len(outputs_cache) == len(outputs_no_cache) + 1) - - output, mems = outputs_cache.to_tuple() - - # create hypothetical next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size) - - # append to next input_ids and token_type_ids - next_input_ids = ops.cat([input_ids_1, next_tokens], dim=-1) - - # causal mask - causal_mask = ops.ones( - input_ids_1.shape[0], - input_ids_1.shape[1] + 1, - input_ids_1.shape[1] + 1, - dtype=mindspore.float32, - ) - causal_mask = ops.triu(causal_mask, diagonal=0) - single_mask = ops.ones(input_ids_1.shape[0], 1, 1, dtype=mindspore.float32) - - # second forward pass - 
output_from_no_past = model(next_input_ids, perm_mask=causal_mask)["last_hidden_state"] - output_from_past = model(next_tokens, mems=mems, perm_mask=single_mask)["last_hidden_state"] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx] - output_from_past_slice = output_from_past[:, 0, random_slice_idx] - - # test that outputs are equal for slice - self.parent.assertTrue(ops.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_xlnet_base_model_with_att_output( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetModel(config) - model.eval() - - attentions = model(input_ids_1, target_mapping=target_mapping, output_attentions=True)["attentions"] - - self.parent.assertEqual(len(attentions), config.n_layer) - self.parent.assertIsInstance(attentions[0], tuple) - self.parent.assertEqual(len(attentions[0]), 2) - self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape) - - def create_and_check_xlnet_lm_head( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetLMHeadModel(config) - model.eval() - - result1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels) - - result2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=result1.mems) - - _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping) - - self.parent.assertEqual(result1.loss.shape, ()) - self.parent.assertEqual(result1.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertListEqual( - [mem.shape for mem in result1.mems], - [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, - ) - - self.parent.assertEqual(result2.loss.shape, ()) - self.parent.assertEqual(result2.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - self.parent.assertListEqual( - [mem.shape for mem in result2.mems], - [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers, - ) - - def create_and_check_xlnet_qa( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetForQuestionAnswering(config) - model.eval() - - result = model(input_ids_1) - - result_with_labels = model( - input_ids_1, - start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask, - ) - - result_with_labels = model( - input_ids_1, - start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - ) - - total_loss, mems = result_with_labels.to_tuple() - - result_with_labels = model( - input_ids_1, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - - total_loss, mems = result_with_labels.to_tuple() - - self.parent.assertEqual(result_with_labels.loss.shape, ()) - self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top)) - self.parent.assertEqual(result.start_top_index.shape, 
(self.batch_size, model.config.start_n_top)) - self.parent.assertEqual( - result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) - ) - self.parent.assertEqual( - result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top) - ) - self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,)) - self.parent.assertListEqual( - [mem.shape for mem in result.mems], - [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, - ) - - def create_and_check_xlnet_token_classif( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetForTokenClassification(config) - model.eval() - - result = model(input_ids_1) - result = model(input_ids_1, labels=token_labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.type_sequence_label_size)) - self.parent.assertListEqual( - [mem.shape for mem in result.mems], - [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, - ) - - def create_and_check_xlnet_sequence_classif( - self, - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ): - model = XLNetForSequenceClassification(config) - model.eval() - - result = model(input_ids_1) - result = model(input_ids_1, labels=sequence_labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size)) - self.parent.assertListEqual( - [mem.shape for mem in result.mems], - [(self.seq_length, self.batch_size, self.hidden_size)] * self.num_hidden_layers, - ) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids_1, - input_ids_2, - input_ids_q, - perm_mask, - input_mask, - target_mapping, - segment_ids, - lm_labels, - sequence_labels, - is_impossible_labels, - token_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids_1} - return config, inputs_dict - - -@require_mindspore -class XLNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - XLNetModel, - XLNetLMHeadModel, - XLNetForTokenClassification, - XLNetForSequenceClassification, - XLNetForQuestionAnswering, - XLNetForQuestionAnsweringSimple, - XLNetForMultipleChoice, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = ( - (XLNetLMHeadModel,) if is_mindspore_available() else () - ) # TODO (PVP): Check other models whether language generation is also applicable - pipeline_model_mapping = ( - { - "feature-extraction": XLNetModel, - "question-answering": XLNetForQuestionAnsweringSimple, - "text-classification": XLNetForSequenceClassification, - "text-generation": XLNetLMHeadModel, - "token-classification": XLNetForTokenClassification, - "zero-shot": XLNetForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - fx_compatible = False - test_pruning = False - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests" and not 
tokenizer_name.endswith("Fast"): - return True - - return False - - # XLNet has 2 QA models -> need to manually set the correct labels for one of them here - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "XLNetForQuestionAnswering": - inputs_dict["start_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - inputs_dict["end_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - - return inputs_dict - - def setUp(self): - self.model_tester = XLNetModelTester(self) - self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_xlnet_base_model(self): - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs) - - def test_xlnet_base_model_use_mems(self): - # checking that in auto-regressive mode, `use_mems` gives the same results - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_model_use_mems(*config_and_inputs) - - def test_seq_classification_use_mems_train(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_use_mems_train(*config_and_inputs) - - def test_xlnet_base_model_with_att_output(self): - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_base_model_with_att_output(*config_and_inputs) - - def test_xlnet_lm_head(self): - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) - - def test_xlnet_sequence_classif(self): - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs) - - def test_xlnet_token_classif(self): - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_token_classif(*config_and_inputs) - - def test_xlnet_qa(self): - self.model_tester.set_seed() - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_xlnet_qa(*config_and_inputs) - - @unittest.skip(reason="xlnet cannot keep gradients in attentions or hidden states") - def test_retain_grad_hidden_states_attentions(self): - return - - # overwrite from test_modeling_common - def _mock_init_weights(self, module): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(3) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.fill_(3) - - for param in ["q", "k", "v", "o", "r", "r_r_bias", "r_s_bias", "r_w_bias", "seg_embed", "mask_emb"]: - if hasattr(module, param) and getattr(module, param) is not None: - weight = getattr(module, param) - weight.data.fill_(3) - - def _check_hidden_states_for_generate( - self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(hidden_states, tuple) - self.assertListEqual( - [isinstance(iter_hidden_states, tuple) for 
iter_hidden_states in hidden_states], - [True] * len(hidden_states), - ) - self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) - - for idx, iter_hidden_states in enumerate(hidden_states): - # check hidden size - for i, layer_hidden_states in enumerate(iter_hidden_states): - # every 2nd tensor is from extra stream - if i % 2 != 0: - seq_len = 1 - else: - # for first item dummy PAD token is appended so need one more - # else offset+dummy_token when using cache - seq_len = (min_length + 1) if idx == 0 else 3 - - expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) - self.assertEqual(layer_hidden_states.shape, expected_shape) - - def _check_attentions_for_generate( - self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1 - ): - self.assertIsInstance(attentions, tuple) - self.assertListEqual( - [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions) - ) - self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups) - - for idx, attentions_item in enumerate(attentions): - for iter_attentions in attentions_item: - tgt_len = min_length - - # for first item dummy PAD token is appended so need one more - # every token after consists of offset+dummy_token length when using cache - if idx == 0: - tgt_len += 1 - else: - tgt_len = 3 - - src_len = min_length + idx + 1 - - expected_shape = ( - batch_size * num_beam_groups, - config.num_attention_heads, - tgt_len, - src_len, - ) - # check attn size - self.assertListEqual( - [layer_attention.shape for layer_attention in iter_attentions], - [expected_shape] * len(iter_attentions), - ) - - @slow - def test_model_from_pretrained(self): - model_name = "xlnet/xlnet-base-cased" - model = XLNetModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_mindspore -class XLNetModelLanguageGenerationTest(unittest.TestCase): - @slow - def test_lm_generate_xlnet_base_cased(self): - model = XLNetLMHeadModel.from_pretrained("xlnet/xlnet-base-cased") - # fmt: off - input_ids = mindspore.tensor( - [ - [ - 67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, - ] - ], - dtype=mindspore.int64, - ) - # fmt: on - # In 1991, the remains of Russian Tsar Nicholas II and his family - # (except for Alexei and Maria) are discovered. - # The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the - # remainder of the story. 1883 Western Siberia, - # a young Grigori Rasputin is asked by his father and a group of men to perform magic. - # Rasputin has a vision and denounces one of the men as a horse thief. Although his - # father initially slaps him for making such an accusation, Rasputin watches as the - # man is chased outside and beaten. 
Twenty years later, Rasputin sees a vision of - # the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, - # with people, even a bishop, begging for his blessing. """ - - # fmt: off - expected_output_ids = [ - 67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, 19, 12943, 4354, 153, 27, 442, 22, 2771, 4901, 9, 69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771, - ] - # fmt: on - # In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) - # are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, - # narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin - # is asked by his father and a group of men to perform magic. Rasputin has a vision and - # denounces one of the men as a horse thief. Although his father initially slaps - # him for making such an accusation, Rasputin watches as the man is chased outside and beaten. - # Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. - # Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. - # , Rasputin is asked to perform magic. He is asked to perform a ritual of the Virgin Mary. - # He is asked to perform a ritual of the Virgin Mary. He is asked to perform - - output_ids = model.generate(input_ids, max_length=200, do_sample=False) - self.assertListEqual(output_ids[0].tolist(), expected_output_ids) \ No newline at end of file diff --git a/tests/transformers/models/xlnet/test_tokenization_xlnet.py b/tests/transformers/models/xlnet/test_tokenization_xlnet.py deleted file mode 100644 index f6d20fc8c..000000000 --- a/tests/transformers/models/xlnet/test_tokenization_xlnet.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
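Aside: the deleted language-generation test above relies on greedy decoding (`do_sample=False`) being fully deterministic, so it can assert exact token-id equality, with the prompt echoed back as a prefix of the generated sequence. A toy sketch of that style of check follows; `toy_generate` is hypothetical and only stands in for a deterministic decoder, it is not the MindNLP `generate` API:

    def toy_generate(prompt_ids, max_length):
        # Hypothetical stand-in for a deterministic (greedy) decoder:
        # it simply repeats the last prompt token until max_length is reached.
        out = list(prompt_ids)
        while len(out) < max_length:
            out.append(out[-1])
        return out

    prompt = [67, 2840, 19, 18]
    output_ids = toy_generate(prompt, max_length=8)

    # Greedy decoding is deterministic, so exact list equality is a valid check,
    # and the prompt must appear unchanged as the prefix of the output.
    assert output_ids[:len(prompt)] == prompt
    assert output_ids == [67, 2840, 19, 18, 18, 18, 18, 18]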
-"""test xlnet tokenization""" - -import unittest - -from mindnlp.transformers import XLNetTokenizer, XLNetTokenizerFast -from mindnlp.utils.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow - -from ...test_tokenization_common import TokenizerTesterMixin - - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - -SPIECE_UNDERLINE = "▁" - - -@require_sentencepiece -@require_tokenizers -class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - from_pretrained_id = "xlnet/xlnet-base-cased" - tokenizer_class = XLNetTokenizer - rust_tokenizer_class = XLNetTokenizerFast - test_rust_tokenizer = True - test_sentencepiece = True - - def setUp(self): - super().setUp() - - # We have a SentencePiece fixture for testing - tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 1 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - vocab_keys = list(self.get_tokenizer().get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), 1_006) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 1_000) - - def test_full_tokenizer(self): - tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual( - tokens, - [ - SPIECE_UNDERLINE + "I", - SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", - "or", - "n", - SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", - "9", - "2", - "0", - "0", - "0", - ",", - SPIECE_UNDERLINE + "and", - SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", - "al", - "s", - "é", - ".", - ], - ) - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]) - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual( - back_tokens, - [ - SPIECE_UNDERLINE + "I", - SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", - "or", - "n", - SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", - "", - "2", - "0", - "0", - "0", - ",", - SPIECE_UNDERLINE + "and", - SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", - "al", - "s", - "", - ".", - ], - ) - - def test_tokenizer_lower(self): - tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual( - tokens, - [ - SPIECE_UNDERLINE + "", - "i", - SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", - "or", - "n", - SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", - "9", - "2", - "0", - "0", - "0", - ",", - SPIECE_UNDERLINE + "and", - SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", - "al", - "se", - ".", - ], - ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"]) - - def test_tokenizer_no_lower(self): - tokenizer = 
XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual( - tokens, - [ - SPIECE_UNDERLINE + "I", - SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", - "or", - "n", - SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", - "9", - "2", - "0", - "0", - "0", - ",", - SPIECE_UNDERLINE + "and", - SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", - SPIECE_UNDERLINE + "f", - "al", - "se", - ".", - ], - ) - - @slow - def test_sequence_builders(self): - tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased") - - text = tokenizer.encode("sequence builders", add_special_tokens=False) - text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) - - encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) - encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) - - assert encoded_sentence == text + [4, 3] - assert encoded_pair == text + [4] + text_2 + [4, 3] - - @slow - def test_tokenizer_integration(self): - expected_encoding = {'input_ids': [[17, 21442, 270, 17, 10, 14645, 318, 34, 17, 4546, 3145, 787, 13, 7752, - 22018, 23, 21, 17, 4546, 3145, 787, 13, 3352, 14431, 13, 5500, 11, 1176, - 580, 13, 16819, 4797, 23, 17, 10, 17135, 658, 19, 457, 7932, 13, 184, 19, - 3154, 17135, 6468, 19, 1404, 12269, 19, 4229, 5356, 16264, 46, 19, 17, - 20545, 10395, 9, 9, 9, 11, 28, 6421, 9531, 20729, 17, 10, 353, 17022, 11, - 21, 6421, 9531, 16949, 17, 10, 11509, 753, 11, 33, 95, 2421, 7385, 956, - 14431, 2626, 25, 842, 7385, 4836, 21, 1429, 2272, 9855, 3120, 161, 24738, - 19, 13203, 658, 218, 787, 21, 430, 18482, 847, 2637, 9, 4, 3], - [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 322, 22178, 27, 1064, 22, 956, 13, 11101, 1429, 5854, 24313, - 18953, 40, 422, 24366, 68, 1758, 37, 10483, 14257, 31, 207, 263, 21, 203, - 3773, 25, 71, 9735, 9, 4, 3], - [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 32, 2049, - 3442, 17, 13894, 3380, 23, 95, 18, 17634, 2288, 9, 4, 3]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], - [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], - [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]], - 'attention_mask': [[1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip - - self.tokenizer_integration_test_util( - expected_encoding=expected_encoding, - model_name="xlnet/xlnet-base-cased", - revision="c841166438c31ec7ca9a106dee7bb312b73ae511", - ) diff --git a/tests/transformers/models/xmod/__init__.py b/tests/transformers/models/xmod/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/xmod/test_modeling_xmod.py b/tests/transformers/models/xmod/test_modeling_xmod.py deleted file mode 100644 index 41ad3600a..000000000 --- a/tests/transformers/models/xmod/test_modeling_xmod.py +++ /dev/null @@ -1,674 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
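For context on the expected encodings above: the recorded XLNet batches are left-padded (pad token id 5 is visible at the start of the shorter sequences), and the attention mask flags real tokens with 1 and padding with 0. A small illustration of how such a mask relates to the padded ids (token values taken loosely from the encoding above):

    pad_token_id = 5  # pad id visible in the recorded XLNet encoding
    padded_ids = [5, 5, 5, 322, 22178, 27, 1064, 4, 3]  # left-padded example
    attention_mask = [0 if tok == pad_token_id else 1 for tok in padded_ids]
    assert attention_mask == [0, 0, 0, 1, 1, 1, 1, 1, 1]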
-import unittest - -import numpy as np -from mindnlp.transformers import XLMRobertaTokenizer -from mindnlp.utils import is_mindspore_available -from mindnlp.utils.testing_utils import require_sentencepiece, require_tokenizers, require_mindspore, slow - -from ...generation.test_utils import GenerationTesterMixin -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask -# from ...test_pipeline_mixin import PipelineTesterMixin - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - - from mindnlp.transformers import ( - XmodConfig, - XmodForCausalLM, - XmodForMaskedLM, - XmodForMultipleChoice, - XmodForQuestionAnswering, - XmodForSequenceClassification, - XmodForTokenClassification, - XmodModel, - ) - from mindnlp.transformers.models.xmod.modeling_xmod import XmodEmbeddings, create_position_ids_from_input_ids - - -class XmodModelTester: - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = random_attention_mask([self.batch_size, self.seq_length]) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = self.get_config() - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def get_config(self): - return XmodConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - 
attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - default_language="en_XX", - ) - - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = XmodModel(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.add_cross_attention = True - model = XmodModel(config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = XmodForCausalLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = XmodForCausalLM(config=config).set_train(False) - - # make sure that ids don't start with pad token - mask = input_ids.ne(config.pad_token_id).long() - input_ids = input_ids * mask - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - 
# create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - - # make sure that ids don't start with pad token - mask = next_tokens.ne(config.pad_token_id).long() - next_tokens = next_tokens * mask - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = ops.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = ops.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] - output_from_past_slice = output_from_past[:, :, random_slice_idx] - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(np.allclose(output_from_past_slice.asnumpy(), output_from_no_past_slice.asnumpy(), atol=1e-3)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = XmodForMaskedLM(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = XmodForTokenClassification(config=config) - - model.set_train(False) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = XmodForMultipleChoice(config=config) - - model.set_train(False) - multiple_choice_inputs_ids = input_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - multiple_choice_input_mask = input_mask.unsqueeze(1).broadcast_to((-1, self.num_choices, -1)) - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = XmodForQuestionAnswering(config=config) - - model.set_train(False) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - 
start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - -@require_mindspore -class XmodModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - XmodForCausalLM, - XmodForMaskedLM, - XmodModel, - XmodForSequenceClassification, - XmodForTokenClassification, - XmodForMultipleChoice, - XmodForQuestionAnswering, - ) - if is_mindspore_available() - else () - ) - all_generative_model_classes = (XmodForCausalLM,) if is_mindspore_available() else () - pipeline_model_mapping = ( - { - "feature-extraction": XmodModel, - "fill-mask": XmodForMaskedLM, - "question-answering": XmodForQuestionAnswering, - "text-classification": XmodForSequenceClassification, - "text-generation": XmodForCausalLM, - "token-classification": XmodForTokenClassification, - "zero-shot": XmodForSequenceClassification, - } - if is_mindspore_available() - else {} - ) - - # TODO: Fix the failed tests - def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name - ): - if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"): - return True - - return False - - def setUp(self): - self.model_tester = XmodModelTester(self) - self.config_tester = ConfigTester(self, config_class=XmodConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def test_for_causal_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - 
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_create_position_ids_respects_padding_index(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. Therefore, the - first available non-padding position index is XmodEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - model = XmodEmbeddings(config=config) - - input_ids = mindspore.Tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = mindspore.Tensor( - [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] - ) - - position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.equal(position_ids, expected_positions))) - - def test_create_position_ids_from_inputs_embeds(self): - """This is a regression test for https://github.com/huggingface/transformers/issues/1761 - - The position ids should be masked with the embedding object's padding index. 
Therefore, the - first available non-padding position index is XmodEmbeddings.padding_idx + 1 - """ - config = self.model_tester.prepare_config_and_inputs()[0] - embeddings = XmodEmbeddings(config=config) - - inputs_embeds = ops.randn(2, 4, 30) - expected_single_positions = [ - 0 + embeddings.padding_idx + 1, - 1 + embeddings.padding_idx + 1, - 2 + embeddings.padding_idx + 1, - 3 + embeddings.padding_idx + 1, - ] - expected_positions = mindspore.Tensor([expected_single_positions, expected_single_positions]) - position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual(position_ids.shape, expected_positions.shape) - self.assertTrue(ops.all(ops.equal(position_ids, expected_positions))) - - def test_set_default_language(self): - config = self.model_tester.prepare_config_and_inputs()[0] - model = XmodForMaskedLM(config=config) - model.set_default_language("en_XX") - self.assertEqual(model.config.default_language, "en_XX") - with self.assertRaises(ValueError): - model.set_default_language("xx_XX") - - def test_freeze_embeddings_and_language_adapters(self): - config = self.model_tester.prepare_config_and_inputs()[0] - model = XmodForMaskedLM(config=config) - num_trainable_params_before = sum(p.numel() for p in model.get_parameters() if p.requires_grad) - model.freeze_embeddings_and_language_adapters() - num_trainable_params_after = sum(p.numel() for p in model.get_parameters() if p.requires_grad) - self.assertLess(num_trainable_params_after, num_trainable_params_before) - - -@require_sentencepiece -@require_tokenizers -@require_mindspore -class XmodModelIntegrationTest(unittest.TestCase): - @slow - def test_xmod_base(self): - model = XmodModel.from_pretrained("facebook/xmod-base") - - # language en_XX - model.set_default_language("en_XX") - input_ids = mindspore.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) - # The dog is cute and lives in the garden house - expected_output_shape = (1, 12, 768) # batch_size, sequence_length, embedding_vector_dim - expected_output_values_last_dim = mindspore.tensor( - [[-0.2394, -0.0036, 0.1252, -0.0087, 0.1325, 0.0580, -0.2049, -0.1978, -0.1223, 0.0648, -0.2599, -0.3724]] - ) - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) - - # language de_DE - model.set_default_language("de_DE") - input_ids = mindspore.tensor([[0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2315, 58761, 18391, 5, 2]]) - # Der Hund ist niedlich und wohnt in einem Gartenhaus. 
- expected_output_shape = (1, 16, 768) # batch_size, sequence_length, embedding_vector_dim - # fmt: off - expected_output_values_last_dim = mindspore.tensor( - [[0.0162, 0.0075, -0.1882, 0.2335, -0.0952, -0.3994, -0.0317, -0.1174, 0.0177, 0.4280, -0.0240, -0.2138, - 0.0785, -0.1045, -0.2811, -0.3220]] - ) - # fmt: on - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) - - @slow - def test_xmod_large_prenorm(self): - model = XmodModel.from_pretrained("facebook/xmod-large-prenorm") - - # language en_XX - model.set_default_language("en_XX") - input_ids = mindspore.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) - # The dog is cute and lives in the garden house - expected_output_shape = (1, 12, 1024) # batch_size, sequence_length, embedding_vector_dim - # fmt: off - expected_output_values_last_dim = mindspore.tensor( - [[-0.0121, -0.0194, -0.0240, -0.0160, -0.0205, -0.0159, -0.0243, -0.0206, -0.0161, -0.0335, -0.0196, - -0.0141]] - ) - # fmt: on - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) - - # language de_DE - model.set_default_language("de_DE") - input_ids = mindspore.tensor([[0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2315, 58761, 18391, 5, 2]]) - # Der Hund ist niedlich und wohnt in einem Gartenhaus. - expected_output_shape = (1, 16, 1024) # batch_size, sequence_length, embedding_vector_dim - # fmt: off - expected_output_values_last_dim = mindspore.tensor( - [[-0.0120, -0.0262, -0.0253, -0.0112, -0.0128, -0.0164, -0.0080, -0.0081, -0.0192, -0.0117, -0.0170, - -0.0120, -0.0210, -0.0173, -0.0078, -0.0122]] - ) - # fmt: on - output = model(input_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) - - @slow - def test_multilingual_batch(self): - model = XmodModel.from_pretrained("facebook/xmod-base") - # fmt: off - input_ids = mindspore.tensor([ - [0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2], - [0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2], - [0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2], - [0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2], - ]) - # fmt: on - lang_ids = mindspore.Tensor([0, 8, 8, 0]) - expected_output_shape = (4, 12, 768) # batch_size, sequence_length, embedding_vector_dim - # fmt: off - expected_output_values_last_dim = mindspore.tensor([ - [-0.2394, -0.0036, 0.1252, -0.0087, 0.1325, 0.0580, -0.2049, -0.1978, -0.1223, 0.0648, -0.2599, -0.3724], - [-0.2668, -0.0235, -0.1739, 0.2266, -0.0901, -0.3482, 0.0105, -0.1915, 0.0397, 0.3822, 0.1836, -0.3407], - [-0.2668, -0.0235, -0.1739, 0.2266, -0.0901, -0.3482, 0.0105, -0.1915, 0.0397, 0.3822, 0.1836, -0.3407], - [-0.2394, -0.0036, 0.1252, -0.0087, 0.1325, 0.0580, -0.2049, -0.1978, -0.1223, 0.0648, -0.2599, -0.3724], - ]) - # fmt: on - output = model(input_ids, lang_ids=lang_ids)["last_hidden_state"] - self.assertEqual(output.shape, expected_output_shape) - # compare the actual values for a slice of 
last dim - self.assertTrue(np.allclose(output[:, :, -1].asnumpy(), expected_output_values_last_dim.asnumpy(), atol=1e-3)) - - @slow - def test_end_to_end_mask_fill(self): - tokenizer = XLMRobertaTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") - model = XmodForMaskedLM.from_pretrained("facebook/xmod-base", default_language="en_XX") - - sentences = [ - "Hello, my dog is a little .", - "Hi !", - ] - - inputs = tokenizer(sentences, return_tensors="ms", padding=True) - input_ids = inputs["input_ids"] - - outputs = model( - input_ids=input_ids, - attention_mask=inputs["attention_mask"], - ) - probs = outputs.logits.softmax(axis=-1) - _, predictions = probs.topk(1) - predictions = predictions.squeeze(-1) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="ms").input_ids - output_non_padded = model(input_ids=inputs_non_padded) - probs_non_padded = output_non_padded.logits.softmax(axis=-1) - _, predictions_non_padded = probs_non_padded.topk(1) - predictions_non_padded = predictions_non_padded.squeeze(-1) - - inputs_padded = tokenizer(sentences[1], return_tensors="ms").input_ids - output_padded = model(input_ids=inputs_padded) - probs_padded = output_padded.logits.softmax(axis=-1) - _, predictions_padded = probs_padded.topk(1) - predictions_padded = predictions_padded.squeeze(-1) - - batch_out_sentence = tokenizer.batch_decode(predictions, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(predictions_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(predictions_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little girl.", - "Hi everyone!", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(batch_out_sentence, [non_padded_sentence, padded_sentence]) diff --git a/tests/transformers/models/yolos/__init__.py b/tests/transformers/models/yolos/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/models/yolos/test_image_processing_yolos.py b/tests/transformers/models/yolos/test_image_processing_yolos.py deleted file mode 100644 index 691643da0..000000000 --- a/tests/transformers/models/yolos/test_image_processing_yolos.py +++ /dev/null @@ -1,532 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
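The mask-fill test deleted above turns logits into predictions by applying a softmax and taking the top-1 token per position, then checks that batched (padded) and per-sentence runs decode to the same text. A rough NumPy sketch of just the top-1 step (synthetic logits, not real model output):

    import numpy as np

    def softmax(x, axis=-1):
        # Numerically stable softmax over the given axis.
        x = x - x.max(axis=axis, keepdims=True)
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    # Synthetic logits with shape (batch, seq_len, vocab_size).
    logits = np.random.default_rng(0).normal(size=(2, 5, 30))
    probs = softmax(logits, axis=-1)
    predictions = probs.argmax(axis=-1)  # equivalent to topk(1) followed by squeeze
    assert predictions.shape == (2, 5)

Because softmax is monotonic, the argmax over probabilities equals the argmax over raw logits; the test applies softmax mainly to mirror the original reference implementation.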
- - -import json -import pathlib -import unittest -import numpy as np - -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import is_mindspore_available, is_vision_available - -from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs - - -if is_mindspore_available(): - import mindspore - from mindspore import ops - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import YolosImageProcessor - - -class YolosImageProcessingTester: - def __init__( - self, - parent, - batch_size=7, - num_channels=3, - min_resolution=30, - max_resolution=400, - do_resize=True, - size=None, - do_normalize=True, - image_mean=[0.5, 0.5, 0.5], - image_std=[0.5, 0.5, 0.5], - do_rescale=True, - rescale_factor=1 / 255, - do_pad=True, - ): - # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p - size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} - self.parent = parent - self.batch_size = batch_size - self.num_channels = num_channels - self.min_resolution = min_resolution - self.max_resolution = max_resolution - self.do_resize = do_resize - self.size = size - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_pad = do_pad - - def prepare_image_processor_dict(self): - return { - "do_resize": self.do_resize, - "size": self.size, - "do_normalize": self.do_normalize, - "image_mean": self.image_mean, - "image_std": self.image_std, - "do_rescale": self.do_rescale, - "rescale_factor": self.rescale_factor, - "do_pad": self.do_pad, - } - - def get_expected_values(self, image_inputs, batched=False): - """ - This function computes the expected height and width when providing images to YolosImageProcessor, - assuming do_resize is set to True with a scalar size. 
- """ - if not batched: - image = image_inputs[0] - if isinstance(image, Image.Image): - width, height = image.size - else: - height, width = image.shape[1], image.shape[2] - - size = self.size["shortest_edge"] - max_size = self.size.get("longest_edge", None) - if max_size is not None: - min_original_size = float(min((height, width))) - max_original_size = float(max((height, width))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if width < height and width != size: - height = int(size * height / width) - width = size - elif height < width and height != size: - width = int(size * width / height) - height = size - width_mod = width % 16 - height_mod = height % 16 - expected_width = width - width_mod - expected_height = height - height_mod - - else: - expected_values = [] - for image in image_inputs: - expected_height, expected_width = self.get_expected_values([image]) - expected_values.append((expected_height, expected_width)) - expected_height = max(expected_values, key=lambda item: item[0])[0] - expected_width = max(expected_values, key=lambda item: item[1])[1] - - return expected_height, expected_width - - def expected_output_image_shape(self, images): - height, width = self.get_expected_values(images, batched=True) - return self.num_channels, height, width - - def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): - return prepare_image_inputs( - batch_size=self.batch_size, - num_channels=self.num_channels, - min_resolution=self.min_resolution, - max_resolution=self.max_resolution, - equal_resolution=equal_resolution, - numpify=numpify, - torchify=torchify, - ) - - -@require_mindspore -@require_vision -class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): - image_processing_class = YolosImageProcessor if is_vision_available() else None - - def setUp(self): - self.image_processor_tester = YolosImageProcessingTester(self) - - @property - def image_processor_dict(self): - return self.image_processor_tester.prepare_image_processor_dict() - - def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "image_mean")) - self.assertTrue(hasattr(image_processing, "image_std")) - self.assertTrue(hasattr(image_processing, "do_normalize")) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - - def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333}) - self.assertEqual(image_processor.do_pad, True) - - image_processor = self.image_processing_class.from_dict( - self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False - ) - self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) - self.assertEqual(image_processor.do_pad, False) - - def test_equivalence_padding(self): - # Initialize image_processings - image_processing_1 = self.image_processing_class(**self.image_processor_dict) - - image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True, torchify=False) - - # Test whether the 
method "pad" and calling the image processor return the same tensors - encoded_images_with_method = image_processing_1.pad(image_inputs, return_tensors="ms") - encoded_images = image_processing_2(image_inputs, return_tensors="ms") - - self.assertTrue( - np.allclose(encoded_images_with_method["pixel_values"].asnumpy(), encoded_images["pixel_values"].asnumpy(), atol=1e-4) - ) - - def test_resize_max_size_respected(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - - # create torch tensors as image - image = ops.randint(0, 256, (3, 100, 1500), dtype=mindspore.int64) - processed_image = image_processor( - image, size={"longest_edge": 1333, "shortest_edge": 800}, do_pad=False, return_tensors="ms" - )["pixel_values"] - - self.assertTrue(processed_image.shape[-1] <= 1333) - self.assertTrue(processed_image.shape[-2] <= 800) - - @slow - def test_call_pytorch_with_coco_detection_annotations(self): - # prepare image and target - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"image_id": 39769, "annotations": target} - - # encode them - image_processing = YolosImageProcessor.from_pretrained("hustvl/yolos-small") - encoding = image_processing(images=image, annotations=target, return_tensors="ms") - - # verify pixel values - expected_shape = (1, 3, 800, 1056) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(np.allclose(encoding["pixel_values"][0, 0, 0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - # verify area - expected_area = mindspore.tensor([5832.7256, 11144.6689, 484763.2500, 829269.8125, 146579.4531, 164177.6250]) - self.assertTrue(np.allclose(encoding["labels"][0]["area"].asnumpy(), expected_area.asnumpy())) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"][0].asnumpy(), expected_boxes_slice.asnumpy(), atol=1e-3)) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue(np.allclose(encoding["labels"][0]["image_id"].asnumpy(), expected_image_id.asnumpy())) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(np.allclose(encoding["labels"][0]["iscrowd"].asnumpy(), expected_is_crowd.asnumpy())) - # verify class_labels - expected_class_labels = mindspore.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue(np.allclose(encoding["labels"][0]["class_labels"].asnumpy(), expected_class_labels.asnumpy())) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue(np.allclose(encoding["labels"][0]["orig_size"].asnumpy(), expected_orig_size.asnumpy())) - # verify size - expected_size = mindspore.tensor([800, 1056]) - self.assertTrue(np.allclose(encoding["labels"][0]["size"].asnumpy(), expected_size.asnumpy())) - - @slow - def test_call_pytorch_with_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: - target = json.loads(f.read()) - - target = {"file_name": "000000039769.png", "image_id": 39769, 
"segments_info": target} - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - # encode them - image_processing = YolosImageProcessor(format="coco_panoptic") - encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="ms") - - # verify pixel values - expected_shape = (1, 3, 800, 1056) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = mindspore.tensor([0.2796, 0.3138, 0.3481]) - self.assertTrue(np.allclose(encoding["pixel_values"][0, 0, 0, :3].asnumpy(), expected_slice.asnumpy(), atol=1e-4)) - - # verify area - expected_area = mindspore.tensor([146591.5000, 163974.2500, 480092.2500, 11187.0000, 5824.5000, 7562.5000]) - self.assertTrue(np.allclose(encoding["labels"][0]["area"].asnumpy(), expected_area.asnumpy())) - # verify boxes - expected_boxes_shape = (6, 4) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = mindspore.tensor([0.2625, 0.5437, 0.4688, 0.8625]) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"][0].asnumpy(), expected_boxes_slice.asnumpy(), atol=1e-3)) - # verify image_id - expected_image_id = mindspore.tensor([39769]) - self.assertTrue(np.allclose(encoding["labels"][0]["image_id"].asnumpy(), expected_image_id.asnumpy())) - # verify is_crowd - expected_is_crowd = mindspore.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(np.allclose(encoding["labels"][0]["iscrowd"].asnumpy(), expected_is_crowd.asnumpy())) - # verify class_labels - expected_class_labels = mindspore.tensor([17, 17, 63, 75, 75, 93]) - self.assertTrue(np.allclose(encoding["labels"][0]["class_labels"].asnumpy(), expected_class_labels.asnumpy())) - # verify masks - expected_masks_sum = 815161 - self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) - # verify orig_size - expected_orig_size = mindspore.tensor([480, 640]) - self.assertTrue(np.allclose(encoding["labels"][0]["orig_size"].asnumpy(), expected_orig_size.asnumpy())) - # verify size - expected_size = mindspore.tensor([800, 1056]) - self.assertTrue(np.allclose(encoding["labels"][0]["size"].asnumpy(), expected_size.asnumpy())) - - # Output size is slight different from DETR as yolos takes mod of 16 - @slow - def test_batched_coco_detection_annotations(self): - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) - - with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotations_0 = {"image_id": 39769, "annotations": target} - annotations_1 = {"image_id": 39769, "annotations": target} - - # Adjust the bounding boxes for the resized image - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotations_1["annotations"])): - coords = annotations_1["annotations"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotations_1["annotations"][i]["bbox"] = new_bbox - - images = [image_0, image_1] - annotations = [annotations_0, annotations_1] - - image_processing = YolosImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="ms", # do_convert_annotations=True - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1056 - expected_shape = (2, 3, 
postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.4169, 0.2765, 0.0458, 0.2215], - [0.1284, 0.2016, 0.1576, 0.0940], - [0.3792, 0.4933, 0.7559, 0.9865], - [0.3794, 0.5002, 0.7563, 0.9955], - [0.1990, 0.5456, 0.3566, 0.8646], - [0.5845, 0.4115, 0.3462, 0.7161], - ] - ) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].asnumpy(), expected_boxes_0.asnumpy(), atol=1e-3)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].asnumpy(), expected_boxes_1.asnumpy(), atol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1056)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1056)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].asnumpy(), expected_boxes_0.asnumpy(), atol=1)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].asnumpy(), expected_boxes_1.asnumpy(), atol=1)) - - # Output size is slight different from DETR as yolos takes mod of 16 - def test_batched_coco_panoptic_annotations(self): - # prepare image, target and masks_path - image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) - - with 
open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: - target = json.loads(f.read()) - - annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} - - w_0, h_0 = image_0.size - w_1, h_1 = image_1.size - for i in range(len(annotation_1["segments_info"])): - coords = annotation_1["segments_info"][i]["bbox"] - new_bbox = [ - coords[0] * w_1 / w_0, - coords[1] * h_1 / h_0, - coords[2] * w_1 / w_0, - coords[3] * h_1 / h_0, - ] - annotation_1["segments_info"][i]["bbox"] = new_bbox - - masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - - images = [image_0, image_1] - annotations = [annotation_0, annotation_1] - - # encode them - image_processing = YolosImageProcessor(format="coco_panoptic") - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_tensors="ms", - return_segmentation_masks=True, - ) - - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 800, 1056 - expected_shape = (2, 3, postprocessed_height, postprocessed_width) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - expected_boxes_0 = mindspore.tensor( - [ - [0.2625, 0.5437, 0.4688, 0.8625], - [0.7719, 0.4104, 0.4531, 0.7125], - [0.5000, 0.4927, 0.9969, 0.9854], - [0.1688, 0.2000, 0.2063, 0.0917], - [0.5492, 0.2760, 0.0578, 0.2187], - [0.4992, 0.4990, 0.9984, 0.9979], - ] - ) - expected_boxes_1 = mindspore.tensor( - [ - [0.1591, 0.3262, 0.2841, 0.5175], - [0.4678, 0.2463, 0.2746, 0.4275], - [0.3030, 0.2956, 0.6042, 0.5913], - [0.1023, 0.1200, 0.1250, 0.0550], - [0.3329, 0.1656, 0.0350, 0.1312], - [0.3026, 0.2994, 0.6051, 0.5987], - ] - ) - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].asnumpy(), expected_boxes_0.asnumpy(), atol=1e-3)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].asnumpy(), expected_boxes_1.asnumpy(), atol=1e-3)) - - # Check the masks have also been padded - self.assertEqual(encoding["labels"][0]["masks"].shape, (6, 800, 1056)) - self.assertEqual(encoding["labels"][1]["masks"].shape, (6, 800, 1056)) - - # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - masks_path=masks_path, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="ms", - ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, (6, 4)) - self.assertEqual(encoding["labels"][1]["boxes"].shape, (6, 4)) - # Convert to absolute coordinates - unnormalized_boxes_0 = ops.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = ops.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = ops.vstack( - 
[ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = ops.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(np.allclose(encoding["labels"][0]["boxes"].asnumpy(), expected_boxes_0.asnumpy(), rtol=1)) - self.assertTrue(np.allclose(encoding["labels"][1]["boxes"].asnumpy(), expected_boxes_1.asnumpy(), rtol=1)) diff --git a/tests/transformers/models/yolos/test_modeling_yolos.py b/tests/transformers/models/yolos/test_modeling_yolos.py deleted file mode 100644 index 36fb9ee0f..000000000 --- a/tests/transformers/models/yolos/test_modeling_yolos.py +++ /dev/null @@ -1,363 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Testing suite for the MindSpore YOLOS model. """ - - -import unittest -import numpy as np - -from mindnlp.transformers import YolosConfig -from mindnlp.utils.testing_utils import require_mindspore, require_vision, slow -from mindnlp.utils import cached_property, is_mindspore_available, is_vision_available - -from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - - from mindnlp.transformers import YolosForObjectDetection, YolosModel - - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers import AutoImageProcessor - - -class YolosModelTester: - def __init__( - self, - parent, - batch_size=13, - image_size=[30, 30], - patch_size=2, - num_channels=3, - is_training=True, - use_labels=True, - hidden_size=32, - num_hidden_layers=2, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - type_sequence_label_size=10, - initializer_range=0.02, - num_labels=3, - scope=None, - n_targets=8, - num_detection_tokens=10, - ): - self.parent = parent - self.batch_size = batch_size - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.is_training = is_training - self.use_labels = use_labels - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - 
self.scope = scope - self.n_targets = n_targets - self.num_detection_tokens = num_detection_tokens - # we set the expected sequence length (which is used in several tests) - # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + num_detection_tokens - num_patches = (image_size[1] // patch_size) * (image_size[0] // patch_size) - self.expected_seq_len = num_patches + 1 + self.num_detection_tokens - - def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]]) - - labels = None - if self.use_labels: - # labels is a list of Dict (each Dict being the labels for a given example in the batch) - labels = [] - for i in range(self.batch_size): - target = {} - target["class_labels"] = ops.randint( - 0, high=self.num_labels, size=(self.n_targets,) - ) - target["boxes"] = ops.rand(self.n_targets, 4) - labels.append(target) - - config = self.get_config() - - return config, pixel_values, labels - - def get_config(self): - return YolosConfig( - image_size=self.image_size, - patch_size=self.patch_size, - num_channels=self.num_channels, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - is_decoder=False, - initializer_range=self.initializer_range, - num_detection_tokens=self.num_detection_tokens, - num_labels=self.num_labels, - ) - - def create_and_check_model(self, config, pixel_values, labels): - model = YolosModel(config=config) - model.set_train(False) - result = model(pixel_values) - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size) - ) - - def create_and_check_for_object_detection(self, config, pixel_values, labels): - model = YolosForObjectDetection(config) - model.set_train(False) - - result = model(pixel_values=pixel_values) - result = model(pixel_values) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_detection_tokens, self.num_labels + 1)) - self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_detection_tokens, 4)) - - result = model(pixel_values=pixel_values, labels=labels) - - self.parent.assertEqual(result.loss.shape, ()) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_detection_tokens, self.num_labels + 1)) - self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_detection_tokens, 4)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - config, pixel_values, labels = config_and_inputs - inputs_dict = {"pixel_values": pixel_values} - return config, inputs_dict - - -@require_mindspore -class YolosModelTest(ModelTesterMixin, unittest.TestCase): - """ - Here we also overwrite some of the tests of test_modeling_common.py, as YOLOS does not use input_ids, inputs_embeds, - attention_mask and seq_length. 
- """ - - all_model_classes = (YolosForObjectDetection, ) if is_mindspore_available() else () - pipeline_model_mapping = ( - {"image-feature-extraction": YolosModel, "object-detection": YolosForObjectDetection} - if is_mindspore_available() - else {} - ) - - test_pruning = False - test_resize_embeddings = False - test_head_masking = False - test_torchscript = False - - # special case for head model - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) - - if return_labels: - if model_class.__name__ == "YolosForObjectDetection": - labels = [] - for i in range(self.model_tester.batch_size): - target = {} - target["class_labels"] = ops.ones( - (self.model_tester.n_targets,), dtype=mindspore.int64 - ) - target["boxes"] = ops.ones( - self.model_tester.n_targets, 4, dtype=mindspore.float16 - ) - labels.append(target) - inputs_dict["labels"] = labels - - return inputs_dict - - def setUp(self): - self.model_tester = YolosModelTester(self) - self.config_tester = ConfigTester(self, config_class=YolosConfig, has_text_modality=False, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_inputs_embeds(self): - # YOLOS does not use inputs_embeds - pass - - def test_model_get_set_embeddings(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - # in YOLOS, the seq_len is different - seq_len = self.model_tester.expected_seq_len - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, seq_len, seq_len], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - 
[self.model_tester.num_attention_heads, seq_len, seq_len], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.set_train(False) - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # YOLOS has a different seq_length - seq_length = self.model_tester.expected_seq_len - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - def test_for_object_detection(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_object_detection(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - model_name = "hustvl/yolos-small" - model = YolosModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@require_mindspore -@require_vision -class YolosModelIntegrationTest(unittest.TestCase): - @cached_property - def default_image_processor(self): - return AutoImageProcessor.from_pretrained("hustvl/yolos-small") if is_vision_available() else None - - @slow - def test_inference_object_detection_head(self): - model = YolosForObjectDetection.from_pretrained("hustvl/yolos-small") - - image_processor = self.default_image_processor - image = prepare_img() - inputs = image_processor(images=image, return_tensors="ms") - - # forward pass - outputs = model(inputs.pixel_values) - - # verify outputs - expected_shape = (1, 100, 92) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice_logits = mindspore.tensor( - [[-23.7219, -10.3165, -14.9083], [-41.5429, -15.2403, -24.1478], [-29.3909, -12.7173, -19.4650]] - ) - expected_slice_boxes = mindspore.tensor( - [[0.2536, 0.5449, 0.4643], [0.2037, 0.7735, 0.3672], [0.7692, 0.4056, 0.4549]] - ) - print("outputs.logits[0, :3, :3].asnumpy()**********", outputs.logits[0, :3, :3].asnumpy()) - self.assertTrue(np.allclose(outputs.logits[0, :3, :3].asnumpy(), expected_slice_logits.asnumpy(), atol=1e-4)) - self.assertTrue(np.allclose(outputs.pred_boxes[0, :3, :3].asnumpy(), expected_slice_boxes.asnumpy(), atol=1e-4)) - - # verify postprocessing - results = image_processor.post_process_object_detection( - outputs, threshold=0.3, target_sizes=[image.size[::-1]] - )[0] - expected_scores = mindspore.tensor([0.9991, 0.9801, 0.9978, 0.9875, 0.9848]) - expected_labels = [75, 75, 17, 63, 17] - expected_slice_boxes = mindspore.tensor([331.8438, 80.5440, 369.9546, 188.0579]) - - self.assertEqual(len(results["scores"]), 5) - self.assertTrue(np.allclose(results["scores"].asnumpy(), expected_scores.asnumpy(), atol=1e-4)) - self.assertSequenceEqual(results["labels"].tolist(), 
expected_labels) - self.assertTrue(np.allclose(results["boxes"][0, :].asnumpy(), expected_slice_boxes.asnumpy())) diff --git a/tests/transformers/pipelines/__init__.py b/tests/transformers/pipelines/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/pipelines/test_pipelines_audio_classification.py b/tests/transformers/pipelines/test_pipelines_audio_classification.py deleted file mode 100644 index 07de77321..000000000 --- a/tests/transformers/pipelines/test_pipelines_audio_classification.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from huggingface_hub import AudioClassificationOutputElement - -from mindnlp.transformers import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING -from mindnlp.transformers.pipelines import AudioClassificationPipeline, pipeline -from mindnlp.utils.testing_utils import ( - is_pipeline_test, - nested_simplify, - slow, -) - -from .test_pipelines_common import ANY -import mindspore - - -@is_pipeline_test -class AudioClassificationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - torch_dtype="float32", - ): - audio_classifier = AudioClassificationPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - torch_dtype=torch_dtype, - ) - - # test with a raw waveform - audio = np.zeros((34000,)) - audio2 = np.zeros((14000,)) - return audio_classifier, [audio2, audio] - - def run_pipeline_test(self, audio_classifier, examples): - audio2, audio = examples - output = audio_classifier(audio) - # by default a model is initialized with num_labels=2 - self.assertEqual( - output, - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - ) - output = audio_classifier(audio, top_k=1) - self.assertEqual( - output, - [ - {"score": ANY(float), "label": ANY(str)}, - ], - ) - - self.run_msaudio(audio_classifier) - - def run_msaudio(self, audio_classifier): - import datasets - - # test with a local file - dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - audio = dataset[0]["audio"]["array"] - output = audio_classifier(audio) - self.assertEqual( - output, - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - ) - - def test_small_model_ms(self): - model = "anton-l/wav2vec2-random-tiny-classifier" - - audio_classifier = pipeline("audio-classification", model=model) - - audio = np.ones((8000,)) - output = audio_classifier(audio, top_k=4) - - EXPECTED_OUTPUT = [ - {"score": 0.0842, "label": "no"}, - {"score": 0.0838, "label": "up"}, - {"score": 0.0837, "label": "go"}, - {"score": 0.0834, "label": "right"}, - ] - 
EXPECTED_OUTPUT_PT_2 = [ - {"score": 0.0845, "label": "stop"}, - {"score": 0.0844, "label": "on"}, - {"score": 0.0841, "label": "right"}, - {"score": 0.0834, "label": "left"}, - ] - self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2]) - - audio_dict = {"array": np.ones((8000,)), "sampling_rate": audio_classifier.feature_extractor.sampling_rate} - output = audio_classifier(audio_dict, top_k=4) - self.assertIn(nested_simplify(output, decimals=4), [EXPECTED_OUTPUT, EXPECTED_OUTPUT_PT_2]) - - @slow - def test_large_model_ms(self): - import datasets - - model = "superb/wav2vec2-base-superb-ks" - - audio_classifier = pipeline("audio-classification", model=model) - dataset = datasets.load_dataset("anton-l/superb_dummy", "ks", split="test", trust_remote_code=True) - - audio = np.array(dataset[3]["speech"], dtype=np.float32) - output = audio_classifier(audio, top_k=4) - self.assertEqual( - nested_simplify(output, decimals=3), - [ - {"score": 0.981, "label": "go"}, - {"score": 0.007, "label": "up"}, - {"score": 0.006, "label": "_unknown_"}, - {"score": 0.001, "label": "down"}, - ], - ) diff --git a/tests/transformers/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/transformers/pipelines/test_pipelines_automatic_speech_recognition.py deleted file mode 100644 index ad4fa72a9..000000000 --- a/tests/transformers/pipelines/test_pipelines_automatic_speech_recognition.py +++ /dev/null @@ -1,1954 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time -import unittest - -import numpy as np -import pytest -from huggingface_hub import AutomaticSpeechRecognitionOutput, hf_hub_download, snapshot_download -from datasets import Audio, load_dataset - -from mindnlp.transformers import ( - MODEL_FOR_CTC_MAPPING, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, - AutoFeatureExtractor, - AutoModelForCausalLM, - AutoModelForSpeechSeq2Seq, - AutoProcessor, - AutoTokenizer, - Speech2TextForConditionalGeneration, - Wav2Vec2ForCTC, - WhisperForConditionalGeneration, -) -from mindnlp.transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline -from mindnlp.transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live -from mindnlp.transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter -from mindnlp.utils.testing_utils import ( - is_pipeline_test, - is_mindspore_available, - nested_simplify, - require_pyctcdecode, - require_mindspore, - require_bfloat16, - slow, -) - -from .test_pipelines_common import ANY - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - -# We can't use this mixin because it assumes TF support. 
-# from .test_pipelines_common import CustomInputPipelineCommonMixin - - -@is_pipeline_test -class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): - model_mapping = dict( - (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else []) - + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) - ) - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - ms_dtype="float32", - ): - if tokenizer is None: - # Side effect of no Fast Tokenizer class for these model, so skipping - # But the slow tokenizer test should still run as they're quite small - self.skipTest(reason="No tokenizer available") - - speech_recognizer = AutomaticSpeechRecognitionPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - ms_dtype=ms_dtype, - ) - - # test with a raw waveform - audio = np.zeros((34000,)) - audio2 = np.zeros((14000,)) - return speech_recognizer, [audio, audio2] - - def run_pipeline_test(self, speech_recognizer, examples): - audio = np.zeros((34000,)) - outputs = speech_recognizer(audio) - self.assertEqual(outputs, {"text": ANY(str)}) - - # Striding - audio = {"raw": audio, "stride": (0, 4000), "sampling_rate": speech_recognizer.feature_extractor.sampling_rate} - if speech_recognizer.type == "ctc": - outputs = speech_recognizer(audio) - self.assertEqual(outputs, {"text": ANY(str)}) - elif "Whisper" in speech_recognizer.model.__class__.__name__: - outputs = speech_recognizer(audio) - self.assertEqual(outputs, {"text": ANY(str)}) - else: - # Non CTC models cannot use striding. - with self.assertRaises(ValueError): - outputs = speech_recognizer(audio) - - # Timestamps - audio = np.zeros((34000,)) - if speech_recognizer.type == "ctc": - outputs = speech_recognizer(audio, return_timestamps="char") - self.assertIsInstance(outputs["chunks"], list) - n = len(outputs["chunks"]) - self.assertEqual( - outputs, - { - "text": ANY(str), - "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(n)], - }, - ) - - outputs = speech_recognizer(audio, return_timestamps="word") - self.assertIsInstance(outputs["chunks"], list) - n = len(outputs["chunks"]) - self.assertEqual( - outputs, - { - "text": ANY(str), - "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(n)], - }, - ) - elif "Whisper" in speech_recognizer.model.__class__.__name__: - outputs = speech_recognizer(audio, return_timestamps=True) - self.assertIsInstance(outputs["chunks"], list) - nb_chunks = len(outputs["chunks"]) - self.assertGreater(nb_chunks, 0) - self.assertEqual( - outputs, - { - "text": ANY(str), - "chunks": [{"text": ANY(str), "timestamp": (ANY(float), ANY(float))} for i in range(nb_chunks)], - }, - ) - else: - # Non CTC models cannot use return_timestamps - with self.assertRaisesRegex( - ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" - ): - outputs = speech_recognizer(audio, return_timestamps="char") - - @require_mindspore - @slow - def test_ms_defaults(self): - pipeline("automatic-speech-recognition") - - @require_mindspore - def test_small_model_ms(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/s2t-small-mustc-en-fr-st", - tokenizer="facebook/s2t-small-mustc-en-fr-st", - ) - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform) - 
self.assertEqual(output, {"text": "(Applaudissements)"}) - output = speech_recognizer(waveform, chunk_length_s=10) - self.assertEqual(output, {"text": "(Applaudissements)"}) - - # Non CTC models cannot use return_timestamps - with self.assertRaisesRegex( - ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" - ): - _ = speech_recognizer(waveform, return_timestamps="char") - - @require_mindspore - def test_small_model_ms_fp16(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/s2t-small-mustc-en-fr-st", - tokenizer="facebook/s2t-small-mustc-en-fr-st", - ms_dtype=mindspore.float16, - ) - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform) - self.assertEqual(output, {"text": "(Applaudissements)"}) - output = speech_recognizer(waveform, chunk_length_s=10) - self.assertEqual(output, {"text": "(Applaudissements)"}) - - # Non CTC models cannot use return_timestamps - with self.assertRaisesRegex( - ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" - ): - _ = speech_recognizer(waveform, return_timestamps="char") - - @require_mindspore - @require_bfloat16 - def test_small_model_ms_bf16(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/s2t-small-mustc-en-fr-st", - tokenizer="facebook/s2t-small-mustc-en-fr-st", - ms_dtype=mindspore.bfloat16, - ) - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform) - self.assertEqual(output, {"text": "(Applaudissements)"}) - output = speech_recognizer(waveform, chunk_length_s=10) - self.assertEqual(output, {"text": "(Applaudissements)"}) - - # Non CTC models cannot use return_timestamps - with self.assertRaisesRegex( - ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$" - ): - _ = speech_recognizer(waveform, return_timestamps="char") - - @slow - def test_whisper_fp16(self): - speech_recognizer = pipeline( - model="openai/whisper-base", - ms_dtype=mindspore.float16, - ) - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - speech_recognizer(waveform) - - @require_mindspore - def test_small_model_ms_seq2seq(self): - speech_recognizer = pipeline( - model="hf-internal-testing/tiny-random-speech-encoder-decoder", - ) - - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform) - self.assertEqual(output, {"text": "あл ش 湯 清 ه ܬ া लᆨしث ल eか u w 全 u"}) - - @require_mindspore - def test_small_model_ms_seq2seq_gen_kwargs(self): - speech_recognizer = pipeline( - model="hf-internal-testing/tiny-random-speech-encoder-decoder", - ) - - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform, max_new_tokens=10, generate_kwargs={"num_beams": 2}) - self.assertEqual(output, {"text": "あл † γ ت ב オ 束 泣 足"}) - - @slow - @require_mindspore - @require_pyctcdecode - def test_large_model_ms_with_lm(self): - dataset = load_dataset("Narsil/asr_dummy", streaming=True, trust_remote_code=True) - third_item = next(iter(dataset["test"].skip(3))) - filename = third_item["file"] - - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm", - ) - self.assertEqual(speech_recognizer.type, "ctc_with_lm") - - output = speech_recognizer(filename) - self.assertEqual( - output, - {"text": "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y 
legendario plumaje"}, - ) - - # Override back to pure CTC - speech_recognizer.type = "ctc" - output = speech_recognizer(filename) - # plumajre != plumaje - self.assertEqual( - output, - { - "text": ( - "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajre" - ) - }, - ) - - speech_recognizer.type = "ctc_with_lm" - # Simple test with CTC with LM, chunking + timestamps - output = speech_recognizer(filename, chunk_length_s=2.0, return_timestamps="word") - self.assertEqual( - output, - { - "text": ( - "y en las ramas medio sumergidas revoloteaban algunos pájaros de quimérico y legendario plumajcri" - ), - "chunks": [ - {"text": "y", "timestamp": (0.52, 0.54)}, - {"text": "en", "timestamp": (0.6, 0.68)}, - {"text": "las", "timestamp": (0.74, 0.84)}, - {"text": "ramas", "timestamp": (0.94, 1.24)}, - {"text": "medio", "timestamp": (1.32, 1.52)}, - {"text": "sumergidas", "timestamp": (1.56, 2.22)}, - {"text": "revoloteaban", "timestamp": (2.36, 3.0)}, - {"text": "algunos", "timestamp": (3.06, 3.38)}, - {"text": "pájaros", "timestamp": (3.46, 3.86)}, - {"text": "de", "timestamp": (3.92, 4.0)}, - {"text": "quimérico", "timestamp": (4.08, 4.6)}, - {"text": "y", "timestamp": (4.66, 4.68)}, - {"text": "legendario", "timestamp": (4.74, 5.26)}, - {"text": "plumajcri", "timestamp": (5.34, 5.74)}, - ], - }, - ) - # CTC + LM models cannot use return_timestamps="char" - with self.assertRaisesRegex( - ValueError, "^CTC with LM can only predict word level timestamps, set `return_timestamps='word'`$" - ): - _ = speech_recognizer(filename, return_timestamps="char") - - @require_mindspore - def test_ms_small_no_tokenizer_files(self): - # test that model without tokenizer file cannot be loaded - with pytest.raises(OSError): - pipeline( - task="automatic-speech-recognition", - model="patrickvonplaten/tiny-wav2vec2-no-tokenizer", - ) - - @require_mindspore - @slow - def test_ms_large(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/wav2vec2-base-960h", - tokenizer="facebook/wav2vec2-base-960h", - ) - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform) - self.assertEqual(output, {"text": ""}) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) - - @require_mindspore - @slow - def test_ms_large_with_input_features(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="hf-audio/wav2vec2-bert-CV16-en", - ) - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = speech_recognizer(waveform) - self.assertEqual(output, {"text": ""}) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) - - @slow - @require_mindspore - def test_return_timestamps_in_preprocess(self): - pipe = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny", - chunk_length_s=8, - stride_length_s=1, - ) - data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) - sample = next(iter(data)) - - res = pipe(sample["audio"]["array"]) - self.assertEqual(res, {"text": " Conquered returned to its place amidst the tents."}) - - 
res = pipe(sample["audio"]["array"], return_timestamps=True) - self.assertEqual( - res, - { - "text": " Conquered returned to its place amidst the tents.", - "chunks": [{"timestamp": (0.0, 3.36), "text": " Conquered returned to its place amidst the tents."}], - }, - ) - - res = pipe(sample["audio"]["array"], return_timestamps="word") - # fmt: off - self.assertEqual( - res, - { - 'text': ' Conquered returned to its place amidst the tents.', - 'chunks': [ - {'text': ' Conquered', 'timestamp': (0.5, 1.2)}, - {'text': ' returned', 'timestamp': (1.2, 1.64)}, - {'text': ' to', 'timestamp': (1.64, 1.84)}, - {'text': ' its', 'timestamp': (1.84, 2.02)}, - {'text': ' place', 'timestamp': (2.02, 2.28)}, - {'text': ' amidst', 'timestamp': (2.28, 2.8)}, - {'text': ' the', 'timestamp': (2.8, 2.98)}, - {'text': ' tents.', 'timestamp': (2.98, 3.48)}, - ], - }, - ) - # fmt: on - - @slow - @require_mindspore - def test_return_timestamps_and_language_in_preprocess(self): - pipe = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny", - chunk_length_s=8, - stride_length_s=1, - return_language=True, - ) - data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) - sample = next(iter(data)) - - res = pipe(sample["audio"]["array"]) - self.assertEqual( - res, - { - "text": " Conquered returned to its place amidst the tents.", - "chunks": [{"language": "english", "text": " Conquered returned to its place amidst the tents."}], - }, - ) - - res = pipe(sample["audio"]["array"], return_timestamps=True) - self.assertEqual( - res, - { - "text": " Conquered returned to its place amidst the tents.", - "chunks": [ - { - "timestamp": (0.0, 3.36), - "language": "english", - "text": " Conquered returned to its place amidst the tents.", - } - ], - }, - ) - - res = pipe(sample["audio"]["array"], return_timestamps="word") - # fmt: off - self.assertEqual( - res, - { - 'text': ' Conquered returned to its place amidst the tents.', - 'chunks': [ - {"language": "english",'text': ' Conquered', 'timestamp': (0.5, 1.2)}, - {"language": "english", 'text': ' returned', 'timestamp': (1.2, 1.64)}, - {"language": "english",'text': ' to', 'timestamp': (1.64, 1.84)}, - {"language": "english",'text': ' its', 'timestamp': (1.84, 2.02)}, - {"language": "english",'text': ' place', 'timestamp': (2.02, 2.28)}, - {"language": "english",'text': ' amidst', 'timestamp': (2.28, 2.8)}, - {"language": "english",'text': ' the', 'timestamp': (2.8, 2.98)}, - {"language": "english",'text': ' tents.', 'timestamp': (2.98, 3.48)}, - ], - }, - ) - # fmt: on - - @slow - @require_mindspore - def test_return_timestamps_in_preprocess_longform(self): - pipe = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny.en", - ) - data = load_dataset("openslr/librispeech_asr", "clean", split="test", streaming=True, trust_remote_code=True) - samples = [next(iter(data)) for _ in range(8)] - audio = np.concatenate([sample["audio"]["array"] for sample in samples]) - - res = pipe(audio) - expected_output = { - "text": " Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst " - "the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst " - "the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst " - "the tents. 
Concord returned to its place amidst the tents." - } - self.assertEqual(res, expected_output) - res = pipe(audio, return_timestamps=True) - self.assertEqual( - res, - { - "text": " Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents. Concord returned to its place amidst the tents.", - "chunks": [ - {"timestamp": (0.0, 3.22), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (3.22, 6.74), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (6.74, 10.26), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (10.26, 13.78), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (13.78, 17.3), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (17.3, 20.82), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (20.82, 24.34), "text": " Concord returned to its place amidst the tents."}, - {"timestamp": (24.34, 27.86), "text": " Concord returned to its place amidst the tents."}, - ], - }, - ) - pipe.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] - res = pipe(audio, return_timestamps="word") - - # fmt: off - self.assertEqual( - res["chunks"][:15], - [ - {"text": " Concord", "timestamp": (0.5, 0.94)}, - {"text": " returned", "timestamp": (0.94, 1.52)}, - {"text": " to", "timestamp": (1.52, 1.78)}, - {"text": " its", "timestamp": (1.78, 1.98)}, - {"text": " place", "timestamp": (1.98, 2.16)}, - {"text": " amidst", "timestamp": (2.16, 2.5)}, - {"text": " the", "timestamp": (2.5, 2.9)}, - {"text": " tents.", "timestamp": (2.9, 4.2)}, - {"text": " Concord", "timestamp": (4.2, 4.5)}, - {"text": " returned", "timestamp": (4.5, 5.0)}, - {"text": " to", "timestamp": (5.0, 5.28)}, - {"text": " its", "timestamp": (5.28, 5.48)}, - {"text": " place", "timestamp": (5.48, 5.7)}, - {"text": " amidst", "timestamp": (5.7, 6.02)}, - {"text": " the", "timestamp": (6.02, 6.4)} - - - ], - ) - # fmt: on - - @require_mindspore - def test_return_timestamps_in_init(self): - # segment-level timestamps are accepted - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny") - feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") - - dummy_speech = np.ones(100) - - pipe = pipeline( - task="automatic-speech-recognition", - model=model, - feature_extractor=feature_extractor, - tokenizer=tokenizer, - chunk_length_s=8, - stride_length_s=1, - return_timestamps=True, - ) - - _ = pipe(dummy_speech) - - # word-level timestamps are accepted - pipe = pipeline( - task="automatic-speech-recognition", - model=model, - feature_extractor=feature_extractor, - tokenizer=tokenizer, - chunk_length_s=8, - stride_length_s=1, - return_timestamps="word", - ) - - _ = pipe(dummy_speech) - - # char-level timestamps are not accepted - with self.assertRaisesRegex( - ValueError, - "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. 
" - "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$", - ): - pipe = pipeline( - task="automatic-speech-recognition", - model=model, - feature_extractor=feature_extractor, - tokenizer=tokenizer, - chunk_length_s=8, - stride_length_s=1, - return_timestamps="char", - ) - - _ = pipe(dummy_speech) - - @require_mindspore - @slow - def test_ms_whisper(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny", - ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) - - output = speech_recognizer([ds[40]["audio"]], chunk_length_s=5, batch_size=4) - self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}]) - - @require_mindspore - @slow - def test_ms_whisper_batched(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny", - ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:2]") - EXPECTED_OUTPUT = [ - {"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."}, - {"text": " Nor is Mr. Quilters' manner less interesting than his matter."}, - ] - - output = speech_recognizer(ds["audio"], batch_size=2) - self.assertEqual(output, EXPECTED_OUTPUT) - - @slow - def test_find_longest_common_subsequence(self): - max_source_positions = 1500 - processor = AutoProcessor.from_pretrained("openai/whisper-tiny") - - previous_sequence = [[51492, 406, 3163, 1953, 466, 13, 51612, 51612]] - self.assertEqual( - processor.decode(previous_sequence[0], output_offsets=True), - { - "text": " not worth thinking about.", - "offsets": [{"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}], - }, - ) - - # Merge when the previous sequence is a suffix of the next sequence - # fmt: off - next_sequences_1 = [ - [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 50614, 50614, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] - ] - # fmt: on - self.assertEqual( - processor.decode(next_sequences_1[0], output_offsets=True), - { - "text": ( - " of spectators, retrievality is not worth thinking about. His instant panic was followed by a" - " small, sharp blow high on his chest.<|endoftext|>" - ), - "offsets": [ - {"text": " of spectators, retrievality is not worth thinking about.", "timestamp": (0.0, 5.0)}, - { - "text": " His instant panic was followed by a small, sharp blow high on his chest.", - "timestamp": (5.0, 9.4), - }, - ], - }, - ) - merge = _find_timestamp_sequence( - [[previous_sequence, (480_000, 0, 0)], [next_sequences_1, (480_000, 120_000, 0)]], - processor.tokenizer, - processor.feature_extractor, - max_source_positions, - ) - - # fmt: off - self.assertEqual( - merge, - [51492, 406, 3163, 1953, 466, 13, 51739, 51739, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959], - ) - # fmt: on - self.assertEqual( - processor.decode(merge, output_offsets=True), - { - "text": ( - " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" - " chest." 
- ), - "offsets": [ - {"text": " not worth thinking about.", "timestamp": (22.56, 27.5)}, - { - "text": " His instant panic was followed by a small, sharp blow high on his chest.", - "timestamp": (27.5, 31.900000000000002), - }, - ], - }, - ) - - # Merge when the sequence is in the middle of the 1st next sequence - # fmt: off - next_sequences_2 = [ - [50364, 295, 6177, 3391, 11, 19817, 3337, 507, 307, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50834, 50257] - ] - # fmt: on - # {'text': ' of spectators, retrievality is not worth thinking about. His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)} - merge = _find_timestamp_sequence( - [[previous_sequence, (480_000, 0, 0)], [next_sequences_2, (480_000, 120_000, 0)]], - processor.tokenizer, - processor.feature_extractor, - max_source_positions, - ) - # fmt: off - self.assertEqual( - merge, - [51492, 406, 3163, 1953, 466, 13, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51959], - ) - # fmt: on - self.assertEqual( - processor.decode(merge, output_offsets=True), - { - "text": ( - " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" - " chest." - ), - "offsets": [ - { - "text": ( - " not worth thinking about. His instant panic was followed by a small, sharp blow high on" - " his chest." - ), - "timestamp": (22.56, 31.900000000000002), - }, - ], - }, - ) - - # Merge when the previous sequence is not included in the current sequence - next_sequences_3 = [[50364, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50584, 50257]] # fmt: skip - # {'text': ' His instant panic was followed by a small, sharp blow high on his chest.','timestamp': (0.0, 9.4)} - merge = _find_timestamp_sequence( - [[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 120_000, 0)]], - processor.tokenizer, - processor.feature_extractor, - max_source_positions, - ) - self.assertEqual( - merge, - [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51832], - ) # fmt: skip - self.assertEqual( - processor.decode(merge, output_offsets=True), - { - "text": ( - " not worth thinking about. His instant panic was followed by a small, sharp blow high on his" - " chest." - ), - "offsets": [ - {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, - { - "text": " His instant panic was followed by a small, sharp blow high on his chest.", - "timestamp": (24.96, 29.36), - }, - ], - }, - ) - # last case is when the sequence is not in the first next predicted start and end of timestamp - next_sequences_3 = [ - [50364, 2812, 9836, 14783, 390, 406, 3163, 1953, 466, 13, 50634, 50634, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 50934] - ] # fmt: skip - merge = _find_timestamp_sequence( - [[previous_sequence, (480_000, 0, 0)], [next_sequences_3, (480_000, 167_000, 0)]], - processor.tokenizer, - processor.feature_extractor, - max_source_positions, - ) - self.assertEqual( - merge, - [51492, 406, 3163, 1953, 466, 13, 51612, 51612, 2812, 9836, 14783, 390, 6263, 538, 257, 1359, 11, 8199, 6327, 1090, 322, 702, 7443, 13, 51912] - ) # fmt: skip - self.assertEqual( - processor.decode(merge, output_offsets=True), - { - "text": ( - " not worth thinking about. 
His instant panic was followed by a small, sharp blow high on his" - " chest." - ), - "offsets": [ - {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, - { - "text": " His instant panic was followed by a small, sharp blow high on his chest.", - "timestamp": (24.96, 30.96), - }, - ], - }, - ) - - @slow - @require_mindspore - def test_whisper_timestamp_prediction(self): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - array = np.concatenate( - [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] - ) - pipe = pipeline( - model="openai/whisper-small", - return_timestamps=True, - ) - - output = pipe(ds[40]["audio"]) - self.assertDictEqual( - output, - { - "text": " A man said to the universe, Sir, I exist.", - "chunks": [{"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 4.26)}], - }, - ) - - output = pipe(array, chunk_length_s=10) - self.assertDictEqual( - nested_simplify(output), - { - "chunks": [ - {"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 5.5)}, - { - "text": ( - " Sweat covered Brion's body, trickling into the " - "tight-loan cloth that was the only garment he wore, the " - "cut" - ), - "timestamp": (5.5, 11.95), - }, - { - "text": ( - " on his chest still dripping blood, the ache of his " - "overstrained eyes, even the soaring arena around him " - "with" - ), - "timestamp": (11.95, 19.61), - }, - { - "text": " the thousands of spectators, retrievality is not worth thinking about.", - "timestamp": (19.61, 25.0), - }, - { - "text": " His instant panic was followed by a small, sharp blow high on his chest.", - "timestamp": (25.0, 29.4), - }, - ], - "text": ( - " A man said to the universe, Sir, I exist. Sweat covered Brion's " - "body, trickling into the tight-loan cloth that was the only garment " - "he wore, the cut on his chest still dripping blood, the ache of his " - "overstrained eyes, even the soaring arena around him with the " - "thousands of spectators, retrievality is not worth thinking about. " - "His instant panic was followed by a small, sharp blow high on his " - "chest." - ), - }, - ) - - output = pipe(array) - self.assertDictEqual( - output, - { - "chunks": [ - {"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 5.5)}, - { - "text": ( - " Sweat covered Brion's body, trickling into the " - "tight-loan cloth that was the only garment" - ), - "timestamp": (5.5, 10.18), - }, - {"text": " he wore.", "timestamp": (10.18, 11.68)}, - {"text": " The cut on his chest still dripping blood.", "timestamp": (11.68, 14.92)}, - {"text": " The ache of his overstrained eyes.", "timestamp": (14.92, 17.6)}, - { - "text": ( - " Even the soaring arena around him with the thousands of spectators were trivialities" - ), - "timestamp": (17.6, 22.56), - }, - {"text": " not worth thinking about.", "timestamp": (22.56, 24.96)}, - ], - "text": ( - " A man said to the universe, Sir, I exist. Sweat covered Brion's " - "body, trickling into the tight-loan cloth that was the only garment " - "he wore. The cut on his chest still dripping blood. The ache of his " - "overstrained eyes. Even the soaring arena around him with the " - "thousands of spectators were trivialities not worth thinking about." 
- ), - }, - ) - - @slow - @require_mindspore - def test_whisper_large_timestamp_prediction(self): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - array = np.concatenate( - [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] - ) - pipe = pipeline(model="openai/whisper-large-v3", return_timestamps=True) - - output = pipe(ds[40]["audio"]) - self.assertDictEqual( - output, - { - "text": " A man said to the universe, Sir, I exist.", - "chunks": [{"text": " A man said to the universe, Sir, I exist.", "timestamp": (0.0, 4.08)}], - }, - ) - - output = pipe(array, chunk_length_s=10) - - self.assertDictEqual( - nested_simplify(output), - { - "chunks": [ - {"timestamp": (0.0, 2.0), "text": (" A man said to the universe,")}, - {"timestamp": (2.0, 4.1), "text": (" Sir, I exist.")}, - {"timestamp": (5.14, 5.96), "text": (" Sweat covered")}, - {"timestamp": (5.96, 8.02), "text": (" Breon's body, trickling into")}, - {"timestamp": (8.02, 10.67), "text": (" the tight loincloth that was the only garment he wore,")}, - {"timestamp": (10.67, 13.67), "text": (" the cut on his chest still dripping blood,")}, - {"timestamp": (13.67, 17.61), "text": (" the ache of his overstrained eyes.")}, - { - "timestamp": (17.61, 24.0), - "text": ( - " Even the soaring arena around him with thousands of spectators were trivialities not worth thinking about." - ), - }, - { - "timestamp": (24.0, 29.94), - "text": (" His instant of panic was followed by a small, sharp blow high on his chest."), - }, - ], - "text": ( - " A man said to the universe, Sir, I exist. Sweat covered Breon's" - " body, trickling into the tight loincloth that was the only garment" - " he wore, the cut on his chest still dripping blood, the ache of his" - " overstrained eyes. Even the soaring arena around him with thousands" - " of spectators were trivialities not worth thinking about. His " - "instant of panic was followed by a small, sharp blow high on his chest." - ), - }, - ) - - output = pipe(array) - self.assertDictEqual( - output, - { - "chunks": [ - {"timestamp": (0.0, 1.96), "text": " A man said to the universe,"}, - {"timestamp": (2.7, 4.1), "text": " Sir, I exist."}, - {"timestamp": (5.14, 6.84), "text": " Sweat covered Brion's body,"}, - { - "timestamp": (7.4, 10.68), - "text": " trickling into the tight loincloth that was the only garment he wore,", - }, - {"timestamp": (11.6, 13.94), "text": " the cut on his chest still dripping blood,"}, - {"timestamp": (14.78, 16.72), "text": " the ache of his overstrained eyes,"}, - { - "timestamp": (17.32, 21.16), - "text": " even the soaring arena around him with the thousands of spectators", - }, - {"timestamp": (21.16, 23.94), "text": " were trivialities not worth thinking about."}, - { - "timestamp": (24.42, 29.94), - "text": " His instant panic was followed by a small sharp blow high on his chest.", - }, - ], - "text": ( - " A man said to the universe, Sir, I exist. Sweat covered Brion's body," - " trickling into the tight loincloth that was the only garment he wore, " - "the cut on his chest still dripping blood, the ache of his overstrained " - "eyes, even the soaring arena around him with the thousands of spectators " - "were trivialities not worth thinking about. His instant panic was followed " - "by a small sharp blow high on his chest." 
- ), - }, - ) - - @slow - @require_mindspore - def test_whisper_word_timestamps_batched(self): - pipe = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny", - chunk_length_s=3, - return_timestamps="word", - ) - data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - sample = data[0]["audio"] - - # not the same output as test_simple_whisper_asr because of chunking - EXPECTED_OUTPUT = { - "text": " Mr. Quilder is the apostle of the middle classes and we are glad to welcome his gospel.", - "chunks": [ - {"text": " Mr.", "timestamp": (0.48, 0.96)}, - {"text": " Quilder", "timestamp": (0.96, 1.24)}, - {"text": " is", "timestamp": (1.24, 1.5)}, - {"text": " the", "timestamp": (1.5, 1.72)}, - {"text": " apostle", "timestamp": (1.72, 1.98)}, - {"text": " of", "timestamp": (1.98, 2.32)}, - {"text": " the", "timestamp": (2.32, 2.5)}, - {"text": " middle", "timestamp": (2.5, 2.68)}, - {"text": " classes", "timestamp": (2.68, 3.2)}, - {"text": " and", "timestamp": (3.2, 3.56)}, - {"text": " we", "timestamp": (3.56, 3.68)}, - {"text": " are", "timestamp": (3.68, 3.8)}, - {"text": " glad", "timestamp": (3.8, 4.1)}, - {"text": " to", "timestamp": (4.1, 4.34)}, - {"text": " welcome", "timestamp": (4.3, 4.6)}, - {"text": " his", "timestamp": (4.6, 4.94)}, - {"text": " gospel.", "timestamp": (4.94, 5.82)}, - ], - } - - # batch size 1: copy the audio sample since pipeline consumes it - output = pipe(sample.copy(), batch_size=1) - print(output) - self.assertDictEqual(output, EXPECTED_OUTPUT) - - # batch size 2: input audio is chunked into smaller pieces so it's testing batching - output = pipe(sample, batch_size=2) - self.assertDictEqual(output, EXPECTED_OUTPUT) - - @slow - @require_mindspore - def test_whisper_large_word_timestamps_batched(self): - pipe = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-large-v3", - return_timestamps="word", - ) - data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - sample = data[0]["audio"] - - # not the same output as test_simple_whisper_asr because of chunking - EXPECTED_OUTPUT = { - "text": " Mr. 
Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.", - "chunks": [ - {"text": " Mr.", "timestamp": (0.0, 0.74)}, - {"text": " Quilter", "timestamp": (0.74, 1.04)}, - {"text": " is", "timestamp": (1.04, 1.3)}, - {"text": " the", "timestamp": (1.3, 1.44)}, - {"text": " apostle", "timestamp": (1.44, 1.74)}, - {"text": " of", "timestamp": (1.74, 2.18)}, - {"text": " the", "timestamp": (2.18, 2.28)}, - {"text": " middle", "timestamp": (2.28, 2.5)}, - {"text": " classes,", "timestamp": (2.5, 3.0)}, - {"text": " and", "timestamp": (3.0, 3.4)}, - {"text": " we", "timestamp": (3.4, 3.5)}, - {"text": " are", "timestamp": (3.5, 3.6)}, - {"text": " glad", "timestamp": (3.6, 3.84)}, - {"text": " to", "timestamp": (3.84, 4.1)}, - {"text": " welcome", "timestamp": (4.1, 4.4)}, - {"text": " his", "timestamp": (4.4, 4.7)}, - {"text": " gospel.", "timestamp": (4.7, 5.34)}, - ], - } - - # batch size 1: copy the audio sample since pipeline consumes it - output = pipe(sample.copy(), batch_size=1) - self.assertDictEqual(output, EXPECTED_OUTPUT) - - # batch size 2: input audio is chunked into smaller pieces so it's testing batching - output = pipe(sample, batch_size=2) - self.assertDictEqual(output, EXPECTED_OUTPUT) - - @require_mindspore - @slow - def test_ms_speech_encoder_decoder(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/s2t-wav2vec2-large-en-de", - feature_extractor="facebook/s2t-wav2vec2-large-en-de", - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'}) - - @slow - @require_mindspore - def test_simple_wav2vec2(self): - model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") - tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h") - feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") - - asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) - - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - output = asr(waveform) - self.assertEqual(output, {"text": ""}) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = asr(audio) - self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) - - data = Audio().encode_example(ds[40]["audio"])["bytes"] - output = asr(data) - self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) - - @slow - @require_mindspore - def test_simple_s2t(self): - model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st") - tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st") - feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st") - - asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) - - waveform = np.tile(np.arange(1000, dtype=np.float32), 34) - - output = asr(waveform) - self.assertEqual(output, {"text": "(Applausi)"}) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = asr(audio) - self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) - - data = 
Audio().encode_example(ds[40]["audio"])["bytes"] - output = asr(data) - self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) - - @slow - @require_mindspore - def test_simple_whisper_asr(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny.en", - ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - audio = ds[0]["audio"] - output = speech_recognizer(audio) - self.assertEqual( - output, - {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."}, - ) - output = speech_recognizer(ds[0]["audio"], return_timestamps=True) - self.assertEqual( - output, - { - "text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.", - "chunks": [ - { - "text": ( - " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." - ), - "timestamp": (0.0, 5.44), - } - ], - }, - ) - speech_recognizer.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]] - output = speech_recognizer(ds[0]["audio"], return_timestamps="word") - # fmt: off - self.assertEqual( - output, - { - 'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.', - 'chunks': [ - {'text': ' Mr.', 'timestamp': (0.38, 1.04)}, - {'text': ' Quilter', 'timestamp': (1.04, 1.18)}, - {'text': ' is', 'timestamp': (1.18, 1.44)}, - {'text': ' the', 'timestamp': (1.44, 1.58)}, - {'text': ' apostle', 'timestamp': (1.58, 1.98)}, - {'text': ' of', 'timestamp': (1.98, 2.32)}, - {'text': ' the', 'timestamp': (2.32, 2.46)}, - {'text': ' middle', 'timestamp': (2.46, 2.56)}, - {'text': ' classes,', 'timestamp': (2.56, 3.4)}, - {'text': ' and', 'timestamp': (3.4, 3.54)}, - {'text': ' we', 'timestamp': (3.54, 3.62)}, - {'text': ' are', 'timestamp': (3.62, 3.72)}, - {'text': ' glad', 'timestamp': (3.72, 4.0)}, - {'text': ' to', 'timestamp': (4.0, 4.26)}, - {'text': ' welcome', 'timestamp': (4.26, 4.56)}, - {'text': ' his', 'timestamp': (4.56, 4.92)}, - {'text': ' gospel.', 'timestamp': (4.92, 5.84)} - ] - } - ) - # fmt: on - - # Whisper can only predict segment level timestamps or word level, not character level - with self.assertRaisesRegex( - ValueError, - "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. 
" - "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$", - ): - _ = speech_recognizer(audio, return_timestamps="char") - - @slow - @require_mindspore - def test_simple_whisper_translation(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-large", - ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) - - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large") - tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large") - feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large") - - speech_recognizer_2 = AutomaticSpeechRecognitionPipeline( - model=model, tokenizer=tokenizer, feature_extractor=feature_extractor - ) - output_2 = speech_recognizer_2(ds[40]["audio"]) - self.assertEqual(output, output_2) - - # either use generate_kwargs or set the model's generation_config - # model.generation_config.task = "transcribe" - # model.generation_config.lang = "<|it|>" - speech_translator = AutomaticSpeechRecognitionPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - generate_kwargs={"task": "transcribe", "language": "<|it|>"}, - ) - output_3 = speech_translator(ds[40]["audio"]) - self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."}) - - @slow - @require_mindspore - def test_whisper_language(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny.en", - ) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - audio = ds[0]["audio"] - - # 1. English-only model compatible with no language argument - output = speech_recognizer(audio) - self.assertEqual( - output, - {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."}, - ) - - # 2. English-only Whisper does not accept the language argument - with self.assertRaisesRegex( - ValueError, - "Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, " - "pass `is_multilingual=True` to generate, or update the generation config.", - ): - _ = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"}) - - # 3. Multilingual model accepts language argument - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="openai/whisper-tiny", - ) - output = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"}) - self.assertEqual( - output, - {"text": " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel."}, - ) - - @slow - def test_speculative_decoding_whisper_non_distil(self): - # Load data: - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True - ) - sample = dataset[0]["audio"] - - # Load model: - model_id = "openai/whisper-large-v2" - processor = AutoProcessor.from_pretrained(model_id) - model = AutoModelForSpeechSeq2Seq.from_pretrained( - model_id, - use_safetensors=True, - ) - - # Load assistant: - assistant_model_id = "openai/whisper-tiny" - assistant_model = AutoModelForSpeechSeq2Seq.from_pretrained( - assistant_model_id, - use_safetensors=True, - ) - - # Load pipeline: - pipe = AutomaticSpeechRecognitionPipeline( - model=model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - generate_kwargs={"language": "en"}, - ) - - start_time = time.time() - transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] - total_time_assist = time.time() - start_time - - start_time = time.time() - transcription_ass = pipe(sample)["text"] - total_time_non_assist = time.time() - start_time - - self.assertEqual(transcription_ass, transcription_non_ass) - self.assertEqual( - transcription_ass, - " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - ) - self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") - - @slow - def test_speculative_decoding_whisper_distil(self): - # Load data: - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]", trust_remote_code=True - ) - sample = dataset[0]["audio"] - - # Load model: - model_id = "openai/whisper-large-v2" - processor = AutoProcessor.from_pretrained(model_id) - model = AutoModelForSpeechSeq2Seq.from_pretrained( - model_id, - use_safetensors=True, - ) - - # Load assistant: - assistant_model_id = "distil-whisper/distil-large-v2" - assistant_model = AutoModelForCausalLM.from_pretrained( - assistant_model_id, - use_safetensors=True, - ) - - # Load pipeline: - pipe = AutomaticSpeechRecognitionPipeline( - model=model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - generate_kwargs={"language": "en"}, - ) - - start_time = time.time() - transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] - total_time_assist = time.time() - start_time - - start_time = time.time() - transcription_ass = pipe(sample)["text"] - total_time_non_assist = time.time() - start_time - - self.assertEqual(transcription_ass, transcription_non_ass) - self.assertEqual( - transcription_ass, - " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - ) - self.assertTrue(total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster") - - @slow - @require_mindspore - def test_xls_r_to_en(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/wav2vec2-xls-r-1b-21-to-en", - feature_extractor="facebook/wav2vec2-xls-r-1b-21-to-en", - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."}) - - @slow - @require_mindspore - def test_xls_r_from_en(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/wav2vec2-xls-r-1b-en-to-15", - feature_extractor="facebook/wav2vec2-xls-r-1b-en-to-15", - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."}) - - @slow - @require_mindspore - def test_speech_to_text_leveraged(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="patrickvonplaten/wav2vec2-2-bart-base", - feature_extractor="patrickvonplaten/wav2vec2-2-bart-base", - tokenizer=AutoTokenizer.from_pretrained("patrickvonplaten/wav2vec2-2-bart-base"), - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"] - output = speech_recognizer(audio) - self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) - - @slow - def test_wav2vec2_conformer_float16(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="facebook/wav2vec2-conformer-rope-large-960h-ft", - ms_dtype=mindspore.float16, - ) - - dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - sample = dataset[0]["audio"] - - output = speech_recognizer(sample) - self.assertEqual( - output, - {"text": "MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL"}, - ) - - @require_mindspore - def test_chunking_fast(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="hf-internal-testing/tiny-random-wav2vec2", - chunk_length_s=10.0, - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"]["array"] - - n_repeats = 2 - audio_tiled = np.tile(audio, n_repeats) - output = speech_recognizer([audio_tiled], batch_size=2) - self.assertEqual(output, [{"text": ANY(str)}]) - self.assertEqual(output[0]["text"][:6], "ZBT ZC") - - @require_mindspore - def test_return_timestamps_ctc_fast(self): - speech_recognizer = pipeline( - task="automatic-speech-recognition", - model="hf-internal-testing/tiny-random-wav2vec2", - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - # Take short audio to keep the test readable - audio = ds[40]["audio"]["array"][:800] - - output = speech_recognizer(audio, return_timestamps="char") - self.assertEqual( - output, - { - "text": "ZBT ZX G", - "chunks": [ - {"text": " ", "timestamp": (0.0, 0.012)}, - {"text": "Z", "timestamp": (0.012, 0.016)}, - {"text": "B", "timestamp": (0.016, 0.02)}, - {"text": 
"T", "timestamp": (0.02, 0.024)}, - {"text": " ", "timestamp": (0.024, 0.028)}, - {"text": "Z", "timestamp": (0.028, 0.032)}, - {"text": "X", "timestamp": (0.032, 0.036)}, - {"text": " ", "timestamp": (0.036, 0.04)}, - {"text": "G", "timestamp": (0.04, 0.044)}, - ], - }, - ) - - output = speech_recognizer(audio, return_timestamps="word") - self.assertEqual( - output, - { - "text": "ZBT ZX G", - "chunks": [ - {"text": "ZBT", "timestamp": (0.012, 0.024)}, - {"text": "ZX", "timestamp": (0.028, 0.036)}, - {"text": "G", "timestamp": (0.04, 0.044)}, - ], - }, - ) - - @require_mindspore - @require_pyctcdecode - def test_chunking_fast_with_lm(self): - speech_recognizer = pipeline( - model="hf-internal-testing/processor_with_lm", - chunk_length_s=10.0, - ) - - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - audio = ds[40]["audio"]["array"] - - n_repeats = 2 - audio_tiled = np.tile(audio, n_repeats) - # Batch_size = 1 - output1 = speech_recognizer([audio_tiled], batch_size=1) - self.assertEqual(output1, [{"text": ANY(str)}]) - self.assertEqual(output1[0]["text"][:6], " 0: - models_are_equal = False - - return models_are_equal - - def check_models_equal_tf(self, model1, model2): - models_are_equal = True - for model1_p, model2_p in zip(model1.weights, model2.weights): - if np.abs(model1_p.numpy() - model2_p.numpy()).sum() > 1e-5: - models_are_equal = False - - return models_are_equal - - -class CustomPipeline(Pipeline): - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, text, maybe_arg=2): - input_ids = self.tokenizer(text, return_tensors="ms") - return input_ids - - def _forward(self, model_inputs): - outputs = self.model(**model_inputs) - return outputs - - def postprocess(self, model_outputs): - return model_outputs["logits"].softmax(-1).numpy() - - -@is_pipeline_test -class CustomPipelineTest(unittest.TestCase): - def test_warning_logs(self): - transformers_logging.set_verbosity_debug() - logger_ = transformers_logging.get_logger("mindnlp.transformers.pipelines.base") - - alias = "text-classification" - # Get the original task, so we can restore it at the end. 
- # (otherwise the subsequential tests in `TextClassificationPipelineTests` will fail) - _, original_task, _ = PIPELINE_REGISTRY.check_task(alias) - - try: - with CaptureLogger(logger_) as cm: - PIPELINE_REGISTRY.register_pipeline(alias, PairClassificationPipeline) - self.assertIn(f"{alias} is already registered", cm.out) - finally: - # restore - PIPELINE_REGISTRY.supported_tasks[alias] = original_task - - def test_register_pipeline(self): - PIPELINE_REGISTRY.register_pipeline( - "custom-text-classification", - pipeline_class=PairClassificationPipeline, - model=AutoModelForSequenceClassification if is_mindspore_available() else None, - default={"ms": "hf-internal-testing/tiny-random-distilbert"}, - type="text", - ) - assert "custom-text-classification" in PIPELINE_REGISTRY.get_supported_tasks() - - _, task_def, _ = PIPELINE_REGISTRY.check_task("custom-text-classification") - self.assertEqual(task_def["ms"], (AutoModelForSequenceClassification,) if is_mindspore_available() else ()) - self.assertEqual(task_def["type"], "text") - self.assertEqual(task_def["impl"], PairClassificationPipeline) - self.assertEqual(task_def["default"], {"model": {"ms": "hf-internal-testing/tiny-random-distilbert"}}) - - # Clean registry for next tests. - del PIPELINE_REGISTRY.supported_tasks["custom-text-classification"] - - # def test_dynamic_pipeline(self): - # PIPELINE_REGISTRY.register_pipeline( - # "pair-classification", - # pipeline_class=PairClassificationPipeline, - # model=AutoModelForSequenceClassification if is_mindspore_available() else None, - # ) - - # classifier = pipeline("pair-classification", model="hf-internal-testing/tiny-random-bert") - - # # Clean registry as we won't need the pipeline to be in it for the rest to work. - # del PIPELINE_REGISTRY.supported_tasks["pair-classification"] - - # with tempfile.TemporaryDirectory() as tmp_dir: - # classifier.save_pretrained(tmp_dir) - # # checks - # self.assertDictEqual( - # classifier.model.config.custom_pipelines, - # { - # "pair-classification": { - # "impl": "custom_pipeline.PairClassificationPipeline", - # "ms": ("AutoModelForSequenceClassification",) if is_mindspore_available() else (), - # } - # }, - # ) - # # Fails if the user forget to pass along `trust_remote_code=True` - # with self.assertRaises(ValueError): - # _ = pipeline(model=tmp_dir) - - # new_classifier = pipeline(model=tmp_dir) - # # Using trust_remote_code=False forces the traditional pipeline tag - # old_classifier = pipeline("text-classification", model=tmp_dir) - # # Can't make an isinstance check because the new_classifier is from the PairClassificationPipeline class of a - # # dynamic module - # self.assertEqual(new_classifier.__class__.__name__, "PairClassificationPipeline") - # self.assertEqual(new_classifier.task, "pair-classification") - # results = new_classifier("I hate you", second_text="I love you") - # self.assertDictEqual( - # nested_simplify(results), - # {"label": "LABEL_0", "score": 0.505, "logits": [-0.003, -0.024]}, - # ) - - # self.assertEqual(old_classifier.__class__.__name__, "TextClassificationPipeline") - # self.assertEqual(old_classifier.task, "text-classification") - # results = old_classifier("I hate you", text_pair="I love you") - # self.assertListEqual( - # nested_simplify(results), - # [{"label": "LABEL_0", "score": 0.505}], - # ) - - # @require_mindspore_or_tf - def test_cached_pipeline_has_minimum_calls_to_head(self): - # Make sure we have cached the pipeline. 
- _ = pipeline("text-classification", model="hf-internal-testing/tiny-random-bert") - with RequestCounter() as counter: - _ = pipeline("text-classification", model="hf-internal-testing/tiny-random-bert") - # self.assertEqual(counter["GET"], 0) - # self.assertEqual(counter["HEAD"], 1) - # self.assertEqual(counter.total_calls, 1) - - # @require_mindspore - # def test_chunk_pipeline_batching_single_file(self): - # # Make sure we have cached the pipeline. - # pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") - # ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") - # audio = ds[40]["audio"]["array"] - - # pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") - # # For some reason scoping doesn't work if not using `self.` - # self.COUNT = 0 - # forward = pipe.model.forward - - # def new_forward(*args, **kwargs): - # self.COUNT += 1 - # return forward(*args, **kwargs) - - # pipe.model.forward = new_forward - - # for out in pipe(audio, return_timestamps="char", chunk_length_s=3, stride_length_s=[1, 1], batch_size=1024): - # pass - - # self.assertEqual(self.COUNT, 1) diff --git a/tests/transformers/pipelines/test_pipelines_depth_estimation.py b/tests/transformers/pipelines/test_pipelines_depth_estimation.py deleted file mode 100644 index 17dfc1bd6..000000000 --- a/tests/transformers/pipelines/test_pipelines_depth_estimation.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -from huggingface_hub import DepthEstimationOutput -from huggingface_hub.utils import insecure_hashlib - -from mindnlp.transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING -from mindnlp.utils import is_mindspore_available, is_vision_available -from mindnlp.transformers.pipelines import DepthEstimationPipeline, pipeline -from mindnlp.utils.testing_utils import ( - is_pipeline_test, - nested_simplify, - require_mindspore, - require_vision, - slow, -) - -from .test_pipelines_common import ANY - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - -if is_vision_available(): - from PIL import Image -else: - - class Image: - @staticmethod - def open(*args, **kwargs): - pass - - -def hashimage(image: Image) -> str: - m = insecure_hashlib.md5(image.tobytes()) - return m.hexdigest() - - -@is_pipeline_test -@require_vision -# @require_timm -@require_mindspore -class DepthEstimationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - torch_dtype="float32", - ): - depth_estimator = DepthEstimationPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - torch_dtype=torch_dtype, - ) - return depth_estimator, [ - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - - def run_pipeline_test(self, depth_estimator, examples): - outputs = depth_estimator("./tests/fixtures/tests_samples/COCO/000000039769.png") - self.assertEqual({"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, outputs) - import datasets - - # we use revision="refs/pr/1" until the PR is merged - # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - outputs = depth_estimator( - [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - "http://images.cocodataset.org/val2017/000000039769.jpg", - # RGBA - dataset[0]["image"], - # LA - dataset[1]["image"], - # L - dataset[2]["image"], - ] - ) - self.assertEqual( - [ - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - ], - outputs, - ) - - @slow - @require_mindspore - def test_large_model(self): - model_id = "Intel/dpt-large" - depth_estimator = pipeline("depth-estimation", model=model_id) - outputs = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg") - outputs["depth"] = hashimage(outputs["depth"]) - - # This seems flaky. - # self.assertEqual(outputs["depth"], "1a39394e282e9f3b0741a90b9f108977") - self.assertEqual(nested_simplify(outputs["predicted_depth"].max().item()), 29.306) - self.assertAlmostEqual(nested_simplify(outputs["predicted_depth"].min().item()), 2.662, 2) - - @require_mindspore - def test_small_model(self): - # This is highly irregular to have no small tests. 
- self.skipTest(reason="There is not hf-internal-testing tiny model for either GLPN nor DPT") - - @require_mindspore - def test_multiprocess(self): - depth_estimator = pipeline( - model="hf-internal-testing/tiny-random-DepthAnythingForDepthEstimation", - num_workers=2, - ) - outputs = depth_estimator( - [ - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - ) - self.assertEqual( - [ - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - {"predicted_depth": ANY(mindspore.Tensor), "depth": ANY(Image.Image)}, - ], - outputs, - ) diff --git a/tests/transformers/pipelines/test_pipelines_document_question_answering.py b/tests/transformers/pipelines/test_pipelines_document_question_answering.py deleted file mode 100644 index 4c11b4090..000000000 --- a/tests/transformers/pipelines/test_pipelines_document_question_answering.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import pytest - -from mindnlp.transformers import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING, pipeline, AutoTokenizer -from mindnlp.transformers.pipelines.document_question_answering import apply_tesseract - -from mindnlp.utils.testing_utils import is_pipeline_test, require_vision, slow, \ - nested_simplify, require_pytesseract - -from mindnlp.utils import is_vision_available, require_mindspore - -from .test_pipelines_common import ANY - -if is_vision_available(): - from PIL import Image - - from mindnlp.transformers.image_utils import load_image -else: - - class Image: - @staticmethod - def open(*args, **kwargs): - pass - - - def load_image(_): - return None - -# This is a pinned image from a specific revision of a document question answering space, hosted by HuggingFace, -# so we can expect it to be available. -INVOICE_URL = ( - "https://hf-mirror.com/spaces/impira/docquery/resolve/2f6c96314dc84dfda62d40de9da55f2f5165d403/invoice.png" -) - - -@is_pipeline_test -@require_mindspore -@require_vision -class DocumentQuestionAnsweringPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING - - @require_pytesseract - @require_vision - def get_test_pipeline(self, model, tokenizer, processor): - dqa_pipeline = pipeline( - "document-question-answering", model=model, tokenizer=tokenizer, image_processor=processor - ) - - image = "INVOICE_URL" - word_boxes = list(zip(*apply_tesseract(load_image(image), None, ""))) - question = "What is the placebo?" 
- examples = [ - { - "image": load_image(image), - "question": question, - }, - { - "image": image, - "question": question, - }, - { - "image": image, - "question": question, - "word_boxes": word_boxes, - }, - ] - return dqa_pipeline, examples - - def run_pipeline_test(self, dqa_pipeline, examples): - outputs = dqa_pipeline(examples, top_k=2) - self.assertEqual( - outputs, - [ - [ - {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)}, - {"score": ANY(float), "answer": ANY(str), "start": ANY(int), "end": ANY(int)}, - ] - ] - * 3, - ) - - @require_mindspore - @require_pytesseract - @pytest.mark.skip - def test_small_model_ms(self): - dqa_pipeline = pipeline("document-question-answering", model="hf-internal-testing/tiny-random-layoutlmv2") - image = INVOICE_URL - question = "How many cats are there?" - - expected_output = [ - {"score": 0.0001, "answer": "oy 2312/2019", "start": 38, "end": 39}, - {"score": 0.0001, "answer": "oy 2312/2019 DUE", "start": 38, "end": 40}, - ] - outputs = dqa_pipeline(image=image, question=question, top_k=2) - self.assertEqual(nested_simplify(outputs, decimals=4), expected_output) - - outputs = dqa_pipeline({"image": image, "question": question}, top_k=2) - self.assertEqual(nested_simplify(outputs, decimals=4), expected_output) - - # This image does not detect ANY text in it, meaning layoutlmv2 should fail. - # Empty answer probably - image = "./tests/fixtures/tests_samples/COCO/000000039769.png" - outputs = dqa_pipeline(image=image, question=question, top_k=2) - self.assertEqual(outputs, []) - - # We can optionnally pass directly the words and bounding boxes - image = "./tests/fixtures/tests_samples/COCO/000000039769.png" - words = [] - boxes = [] - outputs = dqa_pipeline(image=image, question=question, words=words, boxes=boxes, top_k=2) - self.assertEqual(outputs, []) - - @slow - @require_mindspore - @require_pytesseract - def test_large_model(self): - dqa_pipeline = pipeline( - "document-question-answering", - model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa", - revision="9977165", - ) - image = INVOICE_URL - question = "What is the invoice number?" - - outputs = dqa_pipeline(image=image, question=question, top_k=2) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.9944, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0009, "answer": "us-001", "start": 16, "end": 16}, - ], - ) - - outputs = dqa_pipeline({"image": image, "question": question}, top_k=2) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.9944, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0009, "answer": "us-001", "start": 16, "end": 16}, - ], - ) - - outputs = dqa_pipeline( - [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2 - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [ - {"score": 0.9944, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0009, "answer": "us-001", "start": 16, "end": 16}, - ], - ] - * 2, - ) - - @slow - @require_mindspore - @require_pytesseract - def test_large_model_chunk(self): - dqa_pipeline = pipeline( - "document-question-answering", - model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa", - revision="9977165", - max_seq_len=50, - ) - image = INVOICE_URL - question = "What is the invoice number?" 
- - outputs = dqa_pipeline(image=image, question=question, top_k=2, padding=True) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.9974, "answer": "1110212019", "start": 23, "end": 23}, - {"score": 0.9948, "answer": "us-001", "start": 16, "end": 16}, - ], - ) - - outputs = dqa_pipeline({"image": image, "question": question}, top_k=2, padding=True) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.9974, "answer": "1110212019", "start": 23, "end": 23}, - {"score": 0.9948, "answer": "us-001", "start": 16, "end": 16}, - ], - ) - - outputs = dqa_pipeline( - [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2, padding=True - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [ - {"score": 0.9974, "answer": "1110212019", "start": 23, "end": 23}, - {"score": 0.9948, "answer": "us-001", "start": 16, "end": 16}, - ] - ] - * 2, - ) - - @slow - @require_mindspore - @require_pytesseract - @require_vision - def test_large_model_layoutlm(self): - tokenizer = AutoTokenizer.from_pretrained( - "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True - ) - dqa_pipeline = pipeline( - "document-question-answering", - model="impira/layoutlm-document-qa", - tokenizer=tokenizer, - revision="3dc6de3", - ) - image = INVOICE_URL - question = "What is the invoice number?" - - outputs = dqa_pipeline(image=image, question=question, top_k=2) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.4251, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0819, "answer": "1110212019", "start": 23, "end": 23}, - ], - ) - - outputs = dqa_pipeline({"image": image, "question": question}, top_k=2) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.4251, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0819, "answer": "1110212019", "start": 23, "end": 23}, - ], - ) - - outputs = dqa_pipeline( - [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2 - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [ - {"score": 0.4251, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0819, "answer": "1110212019", "start": 23, "end": 23}, - ] - ] - * 2, - ) - - word_boxes = list(zip(*apply_tesseract(load_image(image), None, ""))) - - # This model should also work if `image` is set to None - outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.4251, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.0819, "answer": "1110212019", "start": 23, "end": 23}, - ], - ) - - @slow - @require_mindspore - @require_pytesseract - @require_vision - def test_large_model_layoutlm_chunk(self): - tokenizer = AutoTokenizer.from_pretrained( - "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True - ) - dqa_pipeline = pipeline( - "document-question-answering", - model="impira/layoutlm-document-qa", - tokenizer=tokenizer, - revision="3dc6de3", - max_seq_len=50, - ) - image = INVOICE_URL - question = "What is the invoice number?" 
- - outputs = dqa_pipeline(image=image, question=question, top_k=2, padding=True) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.9999, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.9998, "answer": "us-001", "start": 16, "end": 16}, - ], - ) - - outputs = dqa_pipeline( - [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2, padding=True - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [ - {"score": 0.9999, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.9998, "answer": "us-001", "start": 16, "end": 16}, - ] - ] - * 2, - ) - - word_boxes = list(zip(*apply_tesseract(load_image(image), None, ""))) - - # This model should also work if `image` is set to None - outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2, padding=True) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.9999, "answer": "us-001", "start": 16, "end": 16}, - {"score": 0.9998, "answer": "us-001", "start": 16, "end": 16}, - ], - ) - - @slow - @require_mindspore - def test_large_model_donut(self): - dqa_pipeline = pipeline( - "document-question-answering", - model="naver-clova-ix/donut-base-finetuned-docvqa", - tokenizer=AutoTokenizer.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa"), - image_processor="naver-clova-ix/donut-base-finetuned-docvqa", - ) - - image = INVOICE_URL - question = "What is the invoice number?" - outputs = dqa_pipeline(image=image, question=question, top_k=2) - self.assertEqual(nested_simplify(outputs, decimals=4), [{"answer": "us-001"}]) diff --git a/tests/transformers/pipelines/test_pipelines_feature_extraction.py b/tests/transformers/pipelines/test_pipelines_feature_extraction.py deleted file mode 100644 index 20ef54e44..000000000 --- a/tests/transformers/pipelines/test_pipelines_feature_extraction.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np - -from mindnlp.transformers import ( - FEATURE_EXTRACTOR_MAPPING, - IMAGE_PROCESSOR_MAPPING, - MODEL_MAPPING, - FeatureExtractionPipeline, - LxmertConfig, - is_mindspore_available, - pipeline, -) -from mindnlp.utils.testing_utils import is_pipeline_test, nested_simplify, require_mindspore - - -if is_mindspore_available(): - from mindnlp.core import ops - - -@is_pipeline_test -class FeatureExtractionPipelineTests(unittest.TestCase): - model_mapping = MODEL_MAPPING - - @require_mindspore - def test_small_model(self): - feature_extractor = pipeline( - task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert" - ) - outputs = feature_extractor("This is a test") - self.assertEqual( - nested_simplify(outputs), - [[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 
0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, -0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip - - - @require_mindspore - def test_tokenization_small_model(self): - feature_extractor = pipeline( - task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert" - ) - # test with empty parameters - outputs = feature_extractor("This is a test") - self.assertEqual( - nested_simplify(outputs), - [[[2.287, 1.234, 0.042, 1.53, 1.306, 0.879, -0.526, -1.71, -1.276, 0.756, -0.775, -1.048, -0.25, -0.595, -0.137, -0.598, 2.022, -0.812, 0.284, -0.488, -0.391, -0.403, -0.525, -0.061, -0.228, 1.086, 0.378, -0.14, 0.599, -0.087, -2.259, -0.098], [1.676, 0.232, -1.508, -0.145, 1.798, -1.388, 1.331, -0.37, -0.939, 0.043, 0.06, -0.414, -1.408, 0.24, 0.622, -0.55, -0.569, 1.873, -0.706, 1.924, -0.254, 1.927, -0.423, 0.152, -0.952, 0.509, -0.496, -0.968, 0.093, -1.049, -0.65, 0.312], [0.207, -0.775, -1.822, 0.321, -0.71, -0.201, 0.3, 1.146, -0.233, -0.753, -0.305, 1.309, -1.47, -0.21, 1.802, -1.555, -1.175, 1.323, -0.303, 0.722, -0.076, 0.103, -1.406, 1.931, 0.091, 0.237, 1.172, 1.607, 0.253, -0.9, -1.068, 0.438], [0.615, 1.077, 0.171, -0.175, 1.3, 0.901, -0.653, -0.138, 0.341, -0.654, -0.184, -0.441, -0.424, 0.356, -0.075, 0.26, -1.023, 0.814, 0.524, -0.904, -0.204, -0.623, 1.234, -1.03, 2.594, 0.56, 1.831, -0.199, -1.508, -0.492, -1.687, -2.165], [0.129, 0.008, -1.279, -0.412, -0.004, 1.663, 0.196, 0.104, 0.123, 0.119, 0.635, 1.757, 2.334, -0.799, -1.626, -1.26, 0.595, -0.316, -1.399, 0.232, 0.264, 1.386, -1.171, -0.256, -0.256, -1.944, 1.168, -0.368, -0.714, -0.51, 0.454, 1.148], [-0.32, 0.29, -1.309, -0.177, 0.453, 0.636, -0.024, 0.509, 0.931, -1.754, -1.575, 0.786, 0.046, -1.165, -1.416, 1.373, 1.293, -0.285, -1.541, -1.186, -0.106, -0.994, 2.001, 0.972, -0.02, 1.654, -0.236, 0.643, 1.02, 0.572, -0.914, -0.154], [0.7, -0.937, 0.441, 0.25, 0.78, -0.022, 0.282, -0.095, 1.558, -0.336, 1.706, 0.884, 1.28, 0.198, -0.796, 1.218, -1.769, 1.197, -0.342, -0.177, -0.645, 1.364, 0.008, -0.597, -0.484, -2.772, -0.696, -0.632, -0.34, -1.527, -0.562, 0.862], [2.504, 0.831, -1.271, -0.033, 0.298, -0.735, 1.339, 1.74, 0.233, -1.424, -0.819, -0.761, 0.291, 0.853, -0.092, -0.885, 0.164, 1.025, 0.907, 0.749, -1.515, -0.545, -1.365, 0.271, 0.034, -2.005, 0.031, 0.244, 0.621, 0.176, 0.336, -1.196], [-0.711, 0.591, -1.001, -0.946, 0.784, -1.66, 1.545, 0.799, -0.857, 1.148, 0.213, -0.285, 0.464, -0.139, 0.79, -1.663, -1.121, 0.575, -0.178, -0.508, 1.565, -0.242, -0.346, 1.024, -1.135, -0.158, -2.101, 0.275, 2.009, -0.425, 0.716, 0.981], [0.912, -1.186, -0.846, -0.421, -1.315, -0.827, 0.309, 0.533, 1.029, -2.343, 1.513, -1.238, 1.487, -0.849, 0.896, -0.927, -0.459, 0.159, 0.177, 0.873, 0.935, 1.433, -0.485, 0.737, 1.327, -0.338, 1.608, -0.47, -0.445, -1.118, -0.213, -0.446], [-0.434, -1.362, -1.098, -1.068, 1.507, 0.003, 0.413, -0.395, 0.897, -0.237, 1.405, -0.344, 1.693, 0.677, 0.097, -0.257, -0.602, 1.026, -1.229, 0.855, -0.713, 1.014, 0.443, 0.238, 0.425, -2.184, 1.933, -1.157, -1.132, -0.597, -0.785, 0.967], [0.58, -0.971, 0.789, -0.468, -0.576, 1.779, 1.747, 1.715, -1.939, 0.125, 0.656, -0.042, -1.024, -1.767, 0.107, -0.408, -0.866, -1.774, 1.248, 0.939, -0.033, 1.523, 1.168, -0.744, 0.209, -0.168, -0.316, 0.207, -0.432, 0.047, -0.646, -0.664], [-0.185, 
-0.613, -1.695, 1.602, -0.32, -0.277, 0.967, 0.728, -0.965, -0.234, 1.069, -0.63, -1.631, 0.711, 0.426, 1.298, -0.191, -0.467, -0.771, 0.971, -0.118, -1.577, -2.064, -0.055, -0.59, 0.642, -0.997, 1.251, 0.538, 1.367, 0.106, 1.704]]]) # fmt: skip - - # test with various tokenizer parameters - tokenize_kwargs = {"max_length": 3} - outputs = feature_extractor("This is a test", tokenize_kwargs=tokenize_kwargs) - self.assertEqual(np.squeeze(outputs).shape, (3, 32)) - - tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 4} - outputs = feature_extractor( - ["This is a test", "This", "This is", "This is a", "This is a test test test test"], - tokenize_kwargs=tokenize_kwargs, - ) - self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32)) - - tokenize_kwargs = {"padding": True, "max_length": 4} - outputs = feature_extractor( - ["This is a test", "This", "This is", "This is a", "This is a test test test test"], - truncation=True, - tokenize_kwargs=tokenize_kwargs, - ) - self.assertEqual(np.squeeze(outputs).shape, (5, 4, 32)) - - # raise value error if truncation parameter given for two places - tokenize_kwargs = {"truncation": True} - with self.assertRaises(ValueError): - _ = feature_extractor( - ["This is a test", "This", "This is", "This is a", "This is a test test test test"], - truncation=True, - tokenize_kwargs=tokenize_kwargs, - ) - - @require_mindspore - def test_return_tensors(self): - feature_extractor = pipeline( - task="feature-extraction", model="hf-internal-testing/tiny-random-distilbert" - ) - outputs = feature_extractor("This is a test", return_tensors=True) - self.assertTrue(ops.is_tensor(outputs)) - - def get_shape(self, input_, shape=None): - if shape is None: - shape = [] - if isinstance(input_, list): - subshapes = [self.get_shape(in_, shape) for in_ in input_] - if all(s == 0 for s in subshapes): - shape.append(len(input_)) - else: - subshape = subshapes[0] - shape = [len(input_), *subshape] - elif isinstance(input_, float): - return 0 - else: - raise TypeError("We expect lists of floats, nothing else") - return shape - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - torch_dtype="float32", - ): - if tokenizer is None: - self.skipTest(reason="No tokenizer") - elif ( - type(model.config) in FEATURE_EXTRACTOR_MAPPING - or isinstance(model.config, LxmertConfig) - or type(model.config) in IMAGE_PROCESSOR_MAPPING - ): - self.skipTest( - reason="This is a bimodal model, we need to find a more consistent way to switch on those models." - ) - elif model.config.is_encoder_decoder: - self.skipTest( - """encoder_decoder models are trickier for this pipeline. - Do we want encoder + decoder inputs to get some featues? - Do we want encoder only features ? - For now ignore those. 
- """ - ) - feature_extractor_pipeline = FeatureExtractionPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - torch_dtype=torch_dtype, - ) - return feature_extractor_pipeline, ["This is a test", "This is another test"] - - def run_pipeline_test(self, feature_extractor, examples): - outputs = feature_extractor("This is a test") - - shape = self.get_shape(outputs) - self.assertEqual(shape[0], 1) - - # If we send too small input - # there's a bug within FunnelModel (output with shape [1, 4, 2, 1] doesn't match the broadcast shape [1, 4, 2, 2]) - outputs = feature_extractor(["This is a test", "Another longer test"]) - shape = self.get_shape(outputs) - self.assertEqual(shape[0], 2) - - outputs = feature_extractor("This is a test" * 100, truncation=True) - shape = self.get_shape(outputs) - self.assertEqual(shape[0], 1) \ No newline at end of file diff --git a/tests/transformers/pipelines/test_pipelines_fill_mask.py b/tests/transformers/pipelines/test_pipelines_fill_mask.py deleted file mode 100644 index 7dff33a0b..000000000 --- a/tests/transformers/pipelines/test_pipelines_fill_mask.py +++ /dev/null @@ -1,411 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -from mindnlp.transformers import MODEL_FOR_MASKED_LM_MAPPING, FillMaskPipeline, pipeline -from mindnlp.transformers.pipelines import PipelineException -from mindnlp.utils.testing_utils import ( - backend_empty_cache, - is_pipeline_test, - is_mindspore_available, - nested_simplify, - require_mindspore, - slow, -) - -from .test_pipelines_common import ANY - - -@is_pipeline_test -class FillMaskPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_MASKED_LM_MAPPING - - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - if is_mindspore_available(): - backend_empty_cache() - - - @require_mindspore - def test_small_model(self): - unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2) - - outputs = unmasker("My name is <mask>") - self.assertEqual( - nested_simplify(outputs, decimals=6), - [ - {"sequence": "My name is Maul", "score": 2.2e-05, "token": 35676, "token_str": " Maul"}, - {"sequence": "My name isELS", "score": 2.2e-05, "token": 16416, "token_str": "ELS"}, - ], - ) - - outputs = unmasker("The largest city in France is <mask>") - self.assertEqual( - nested_simplify(outputs, decimals=6), - [ - { - "sequence": "The largest city in France is Maul", - "score": 2.2e-05, - "token": 35676, - "token_str": " Maul", - }, - {"sequence": "The largest city in France isELS", "score": 2.2e-05, "token": 16416, "token_str": "ELS"}, - ], - ) - - outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3) - self.assertEqual( - nested_simplify(outputs, decimals=6), - [ - {"sequence": "My name is Patrick", "score": 2.1e-05, "token": 3499, "token_str": " Patrick"}, - {"sequence": "My name is Te", "score": 2e-05, "token": 2941, "token_str": " Te"}, - {"sequence": "My name is Clara", "score": 2e-05, "token": 13606, "token_str": " Clara"}, - ], - ) - - outputs = unmasker("My name is <mask> <mask>", top_k=2) - - self.assertEqual( - nested_simplify(outputs, decimals=6), - [ - [ - { - "score": 2.2e-05, - "token": 35676, - "token_str": " Maul", - "sequence": "My name is Maul", - }, - {"score": 2.2e-05, "token": 16416, "token_str": "ELS", "sequence": "My name isELS"}, - ], - [ - { - "score": 2.2e-05, - "token": 35676, - "token_str": " Maul", - "sequence": "My name is Maul", - }, - {"score": 2.2e-05, "token": 16416, "token_str": "ELS", "sequence": "My name isELS"}, - ], - ], - ) - - @require_mindspore - def test_fp16_casting(self): - pipe = pipeline( - "fill-mask", - model="hf-internal-testing/tiny-random-distilbert", - ) - - # convert model to fp16 - pipe.model.half() - - response = pipe("Paris is the [MASK] of France.") - # We actually don't care about the result, we just want to make sure - # it works, meaning the float16 tensor got casted back to float32 - # for postprocessing. 
- self.assertIsInstance(response, list) - - @slow - @require_mindspore - def test_large_model(self): - unmasker = pipeline(task="fill-mask", model="distilbert/distilroberta-base", top_k=2) - self.run_large_test(unmasker) - - - def run_large_test(self, unmasker): - outputs = unmasker("My name is <mask>") - self.assertEqual( - nested_simplify(outputs), - [ - {"sequence": "My name is John", "score": 0.008, "token": 610, "token_str": " John"}, - {"sequence": "My name is Chris", "score": 0.007, "token": 1573, "token_str": " Chris"}, - ], - ) - outputs = unmasker("The largest city in France is <mask>") - self.assertEqual( - nested_simplify(outputs), - [ - { - "sequence": "The largest city in France is Paris", - "score": 0.251, - "token": 2201, - "token_str": " Paris", - }, - { - "sequence": "The largest city in France is Lyon", - "score": 0.214, - "token": 12790, - "token_str": " Lyon", - }, - ], - ) - - outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3) - self.assertEqual( - nested_simplify(outputs), - [ - {"sequence": "My name is Patrick", "score": 0.005, "token": 3499, "token_str": " Patrick"}, - {"sequence": "My name is Clara", "score": 0.000, "token": 13606, "token_str": " Clara"}, - {"sequence": "My name is Te", "score": 0.000, "token": 2941, "token_str": " Te"}, - ], - ) - - dummy_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit," * 100 - outputs = unmasker( - "My name is <mask>" + dummy_str, - tokenizer_kwargs={"truncation": True}, - ) - simplified = nested_simplify(outputs, decimals=4) - self.assertEqual( - [{"sequence": x["sequence"][:100]} for x in simplified], - [ - {"sequence": f"My name is,{dummy_str}"[:100]}, - {"sequence": f"My name is:,{dummy_str}"[:100]}, - ], - ) - self.assertEqual( - [{k: x[k] for k in x if k != "sequence"} for x in simplified], - [ - {"score": 0.2819, "token": 6, "token_str": ","}, - {"score": 0.0954, "token": 46686, "token_str": ":,"}, - ], - ) - - @require_mindspore - def test_model_no_pad(self): - unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base") - unmasker.tokenizer.pad_token_id = None - unmasker.tokenizer.pad_token = None - self.run_pipeline_test(unmasker, []) - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - torch_dtype="float32", - ): - if tokenizer is None or tokenizer.mask_token_id is None: - self.skipTest(reason="The provided tokenizer has no mask token, (probably reformer or wav2vec2)") - - fill_masker = FillMaskPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - torch_dtype=torch_dtype, - ) - examples = [ - f"This is another {tokenizer.mask_token} test", - ] - return fill_masker, examples - - def run_pipeline_test(self, fill_masker, examples): - tokenizer = fill_masker.tokenizer - model = fill_masker.model - - outputs = fill_masker( - f"This is a {tokenizer.mask_token}", - ) - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ) - - outputs = fill_masker([f"This is a 
{tokenizer.mask_token}"]) - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ) - - outputs = fill_masker([f"This is a {tokenizer.mask_token}", f"Another {tokenizer.mask_token} great test."]) - self.assertEqual( - outputs, - [ - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ], - ) - - with self.assertRaises(ValueError): - fill_masker([None]) - # No mask_token is not supported - with self.assertRaises(PipelineException): - fill_masker("This is") - - self.run_test_top_k(model, tokenizer) - self.run_test_targets(model, tokenizer) - self.run_test_top_k_targets(model, tokenizer) - self.fill_mask_with_duplicate_targets_and_top_k(model, tokenizer) - self.fill_mask_with_multiple_masks(model, tokenizer) - - def run_test_targets(self, model, tokenizer): - vocab = tokenizer.get_vocab() - targets = sorted(vocab.keys())[:2] - # Pipeline argument - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, targets=targets) - outputs = fill_masker(f"This is a {tokenizer.mask_token}") - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ) - target_ids = {vocab[el] for el in targets} - self.assertEqual({el["token"] for el in outputs}, target_ids) - processed_targets = [tokenizer.decode([x]) for x in target_ids] - self.assertEqual({el["token_str"] for el in outputs}, set(processed_targets)) - - # Call argument - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) - outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets) - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ) - target_ids = {vocab[el] for el in targets} - self.assertEqual({el["token"] for el in outputs}, target_ids) - processed_targets = [tokenizer.decode([x]) for x in target_ids] - self.assertEqual({el["token_str"] for el in outputs}, set(processed_targets)) - - # Score equivalence - outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets) - tokens = [top_mask["token_str"] for top_mask 
in outputs] - scores = [top_mask["score"] for top_mask in outputs] - - # For some BPE tokenizers, `` is removed during decoding, so `token_str` won't be the same as in `targets`. - if set(tokens) == set(targets): - unmasked_targets = fill_masker(f"This is a {tokenizer.mask_token}", targets=tokens) - target_scores = [top_mask["score"] for top_mask in unmasked_targets] - self.assertEqual(nested_simplify(scores), nested_simplify(target_scores)) - - # Raises with invalid - with self.assertRaises(ValueError): - outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[]) - # For some tokenizers, `""` is actually in the vocabulary and the expected error won't raised - if "" not in tokenizer.get_vocab(): - with self.assertRaises(ValueError): - outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[""]) - with self.assertRaises(ValueError): - outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets="") - - def run_test_top_k(self, model, tokenizer): - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=2) - outputs = fill_masker(f"This is a {tokenizer.mask_token}") - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ) - - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) - outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=2) - self.assertEqual( - outputs2, - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ) - self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2)) - - def run_test_top_k_targets(self, model, tokenizer): - vocab = tokenizer.get_vocab() - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) - - # top_k=2, ntargets=3 - targets = sorted(vocab.keys())[:3] - outputs = fill_masker(f"This is a {tokenizer.mask_token}", top_k=2, targets=targets) - - # If we use the most probably targets, and filter differently, we should still - # have the same results - targets2 = [el["token_str"] for el in sorted(outputs, key=lambda x: x["score"], reverse=True)] - # For some BPE tokenizers, `` is removed during decoding, so `token_str` won't be the same as in `targets`. 
- if set(targets2).issubset(targets): - outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=3, targets=targets2) - # They should yield exactly the same result - self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2)) - - def fill_mask_with_duplicate_targets_and_top_k(self, model, tokenizer): - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) - vocab = tokenizer.get_vocab() - # String duplicates + id duplicates - targets = sorted(vocab.keys())[:3] - targets = [targets[0], targets[1], targets[0], targets[2], targets[1]] - outputs = fill_masker(f"My name is {tokenizer.mask_token}", targets=targets, top_k=10) - - # The target list contains duplicates, so we can't output more - # than them - self.assertEqual(len(outputs), 3) - - def fill_mask_with_multiple_masks(self, model, tokenizer): - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) - - outputs = fill_masker( - f"This is a {tokenizer.mask_token} {tokenizer.mask_token} {tokenizer.mask_token}", top_k=2 - ) - self.assertEqual( - outputs, - [ - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - [ - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - {"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)}, - ], - ], - ) \ No newline at end of file diff --git a/tests/transformers/pipelines/test_pipelines_image_classification.py b/tests/transformers/pipelines/test_pipelines_image_classification.py deleted file mode 100644 index 6c9463da7..000000000 --- a/tests/transformers/pipelines/test_pipelines_image_classification.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -from mindnlp.transformers import ( - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, - PreTrainedTokenizerBase -) -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) -from mindnlp.transformers.pipelines import ImageClassificationPipeline, pipeline -from mindnlp.utils.testing_utils import ( - is_pipeline_test, - nested_simplify, - require_mindspore, - require_bfloat16, - require_vision, - slow, -) - -from .test_pipelines_common import ANY - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import ops - -if is_vision_available(): - from PIL import Image -else: - - class Image: - @staticmethod - def open(*args, **kwargs): - pass - - -@is_pipeline_test -@require_mindspore -@require_vision -class ImageClassificationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - ms_dtype="float32", - ): - image_classifier = ImageClassificationPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - ms_dtype=ms_dtype, - top_k=2, - ) - examples = [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - "http://images.cocodataset.org/val2017/000000039769.jpg", - ] - return image_classifier, examples - - def run_pipeline_test(self, image_classifier, examples): - outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png") - - self.assertEqual( - outputs, - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - ) - - import datasets - - # we use revision="refs/pr/1" until the PR is merged - # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - - # Accepts URL + PIL.Image + lists - outputs = image_classifier( - [ - Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), - "http://images.cocodataset.org/val2017/000000039769.jpg", - # RGBA - dataset[0]["image"], - # LA - dataset[1]["image"], - # L - dataset[2]["image"], - ] - ) - self.assertEqual( - outputs, - [ - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - [ - {"score": ANY(float), "label": ANY(str)}, - {"score": ANY(float), "label": ANY(str)}, - ], - ], - ) - - - @require_mindspore - def test_small_model_pt(self): - small_model = "hf-internal-testing/tiny-random-vit" - image_classifier = pipeline("image-classification", model=small_model) - - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - self.assertEqual( - nested_simplify(outputs, decimals=4), - [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}], - ) - - outputs = image_classifier( - [ - "http://images.cocodataset.org/val2017/000000039769.jpg", - "http://images.cocodataset.org/val2017/000000039769.jpg", - ], - top_k=2, - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 
0.426}], - [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}], - ], - ) - - def test_custom_tokenizer(self): - tokenizer = PreTrainedTokenizerBase() - - # Assert that the pipeline can be initialized with a feature extractor that is not in any mapping - image_classifier = pipeline( - "image-classification", model="hf-internal-testing/tiny-random-vit", tokenizer=tokenizer - ) - - self.assertIs(image_classifier.tokenizer, tokenizer) - - @require_mindspore - def test_ms_float16_pipeline(self): - image_classifier = pipeline( - "image-classification", model="hf-internal-testing/tiny-random-vit", ms_dtype=mindspore.float16 - ) - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - - self.assertEqual( - nested_simplify(outputs, decimals=3), - [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}], - ) - - @require_mindspore - @require_bfloat16 - def test_ms_bfloat16_pipeline(self): - image_classifier = pipeline( - "image-classification", model="hf-internal-testing/tiny-random-vit", ms_dtype=mindspore.bfloat16 - ) - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - - self.assertEqual( - nested_simplify(outputs, decimals=3), - [{"label": "LABEL_1", "score": 0.574}, {"label": "LABEL_0", "score": 0.426}], - ) - - @slow - @require_mindspore - def test_perceiver(self): - # Perceiver is not tested by `run_pipeline_test` properly. - # That is because the type of feature_extractor and model preprocessor need to be kept - # in sync, which is not the case in the current design - image_classifier = pipeline("image-classification", model="deepmind/vision-perceiver-conv") - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.4385, "label": "tabby, tabby cat"}, - {"score": 0.321, "label": "tiger cat"}, - {"score": 0.0502, "label": "Egyptian cat"}, - {"score": 0.0137, "label": "crib, cot"}, - {"score": 0.007, "label": "radiator"}, - ], - ) - - image_classifier = pipeline("image-classification", model="deepmind/vision-perceiver-fourier") - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.5658, "label": "tabby, tabby cat"}, - {"score": 0.1309, "label": "tiger cat"}, - {"score": 0.0722, "label": "Egyptian cat"}, - {"score": 0.0707, "label": "remote control, remote"}, - {"score": 0.0082, "label": "computer keyboard, keypad"}, - ], - ) - - image_classifier = pipeline("image-classification", model="deepmind/vision-perceiver-learned") - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - {"score": 0.3022, "label": "tabby, tabby cat"}, - {"score": 0.2362, "label": "Egyptian cat"}, - {"score": 0.1856, "label": "tiger cat"}, - {"score": 0.0324, "label": "remote control, remote"}, - {"score": 0.0096, "label": "quilt, comforter, comfort, puff"}, - ], - ) - - @slow - @require_mindspore - def test_multilabel_classification(self): - small_model = "hf-internal-testing/tiny-random-vit" - - # Sigmoid is applied for multi-label classification - image_classifier = pipeline("image-classification", model=small_model) - image_classifier.model.config.problem_type = "multi_label_classification" - - outputs = image_classifier("http://images.cocodataset.org/val2017/000000039769.jpg") - self.assertEqual( - 
nested_simplify(outputs, decimals=4), - [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}], - ) - - outputs = image_classifier( - [ - "http://images.cocodataset.org/val2017/000000039769.jpg", - "http://images.cocodataset.org/val2017/000000039769.jpg", - ] - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}], - [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}], - ], - ) - - @slow - @require_mindspore - def test_function_to_apply(self): - small_model = "hf-internal-testing/tiny-random-vit" - - # Sigmoid is applied for multi-label classification - image_classifier = pipeline("image-classification", model=small_model) - - outputs = image_classifier( - "http://images.cocodataset.org/val2017/000000039769.jpg", - function_to_apply="sigmoid", - ) - self.assertEqual( - nested_simplify(outputs, decimals=4), - [{"label": "LABEL_1", "score": 0.5356}, {"label": "LABEL_0", "score": 0.4612}], - ) \ No newline at end of file diff --git a/tests/transformers/pipelines/test_pipelines_image_feature_extraction.py b/tests/transformers/pipelines/test_pipelines_image_feature_extraction.py deleted file mode 100644 index 1adc436af..000000000 --- a/tests/transformers/pipelines/test_pipelines_image_feature_extraction.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import pytest - -from mindnlp.transformers import ( - MODEL_MAPPING, - TOKENIZER_MAPPING, - ImageFeatureExtractionPipeline, - pipeline, -) -from mindnlp.utils import ( - is_mindspore_available, - is_vision_available, -) -from mindnlp.utils.testing_utils import is_pipeline_test, nested_simplify, require_mindspore - - -if is_mindspore_available(): - from mindnlp.core import ops - - -if is_vision_available(): - from PIL import Image - - -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -@is_pipeline_test -class ImageFeatureExtractionPipelineTests(unittest.TestCase): - model_mapping = MODEL_MAPPING - - @require_mindspore - def test_small_model(self): - feature_extractor = pipeline( - task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit" - ) - img = prepare_img() - outputs = feature_extractor(img) - self.assertEqual( - nested_simplify(outputs[0][0]), - [-1.417, -0.392, -1.264, -1.196, 1.648, 0.885, 0.56, -0.606, -1.175, 0.823, 1.912, 0.081, -0.053, 1.119, -0.062, -1.757, -0.571, 0.075, 0.959, 0.118, 1.201, -0.672, -0.498, 0.364, 0.937, -1.623, 0.228, 0.19, 1.697, -1.115, 0.583, -0.981]) # fmt: skip - - @require_mindspore - def test_small_model_w_pooler(self): - feature_extractor = pipeline( - task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit-w-pooler" - ) - img = prepare_img() - outputs = feature_extractor(img, pool=True) - self.assertEqual( - nested_simplify(outputs[0]), - [-0.056, 0.083, 0.021, 0.038, 0.242, -0.279, -0.033, -0.003, 0.200, -0.192, 0.045, -0.095, -0.077, 0.017, -0.058, -0.063, -0.029, -0.204, 0.014, 0.042, 0.305, -0.205, -0.099, 0.146, -0.287, 0.020, 0.168, -0.052, 0.046, 0.048, -0.156, 0.093]) # fmt: skip - - @require_mindspore - def test_image_processing_small_model(self): - feature_extractor = pipeline( - task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit" - ) - - # test with image processor parameters - image_processor_kwargs = {"size": {"height": 300, "width": 300}} - img = prepare_img() - with pytest.raises(ValueError): - # Image doesn't match model input size - feature_extractor(img, image_processor_kwargs=image_processor_kwargs) - - image_processor_kwargs = {"image_mean": [0, 0, 0], "image_std": [1, 1, 1]} - img = prepare_img() - outputs = feature_extractor(img, image_processor_kwargs=image_processor_kwargs) - self.assertEqual(np.squeeze(outputs).shape, (226, 32)) - - # Test pooling option - outputs = feature_extractor(img, pool=True) - self.assertEqual(np.squeeze(outputs).shape, (32,)) - - @require_mindspore - def test_return_tensors(self): - feature_extractor = pipeline( - task="image-feature-extraction", model="hf-internal-testing/tiny-random-vit" - ) - img = prepare_img() - outputs = feature_extractor(img, return_tensors=True) - self.assertTrue(ops.is_tensor(outputs)) - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - feature_extractor=None, - processor=None, - ms_dtype="float32", - ): - if image_processor is None: - self.skipTest(reason="No image processor") - - elif type(model.config) in TOKENIZER_MAPPING: - self.skipTest( - reason="This is a bimodal model, we need to find a more consistent way to switch on those models." - ) - - elif model.config.is_encoder_decoder: - self.skipTest( - """encoder_decoder models are trickier for this pipeline. 
- Do we want encoder + decoder inputs to get some featues? - Do we want encoder only features ? - For now ignore those. - """ - ) - - feature_extractor_pipeline = ImageFeatureExtractionPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - ms_dtype=ms_dtype, - ) - img = prepare_img() - return feature_extractor_pipeline, [img, img] - - def run_pipeline_test(self, feature_extractor, examples): - imgs = examples - outputs = feature_extractor(imgs[0]) - - self.assertEqual(len(outputs), 1) - - outputs = feature_extractor(imgs) - self.assertEqual(len(outputs), 2) \ No newline at end of file diff --git a/tests/transformers/pipelines/test_pipelines_image_segmentation.py b/tests/transformers/pipelines/test_pipelines_image_segmentation.py deleted file mode 100644 index 476a6486c..000000000 --- a/tests/transformers/pipelines/test_pipelines_image_segmentation.py +++ /dev/null @@ -1,740 +0,0 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile -import unittest -from typing import Dict - -import datasets -import numpy as np -import requests -from datasets import load_dataset -from huggingface_hub.utils import insecure_hashlib - -from mindnlp.transformers import ( - MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, - MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, - MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, - AutoImageProcessor, - AutoModelForImageSegmentation, - AutoModelForInstanceSegmentation, - DetrForSegmentation, - ImageSegmentationPipeline, - MaskFormerForInstanceSegmentation, - pipeline, -) -from mindnlp.utils import is_vision_available -from mindnlp.utils.testing_utils import ( - is_pipeline_test, - nested_simplify, - require_mindspore, - require_vision, - slow, -) - -from .test_pipelines_common import ANY - - -if is_vision_available(): - from PIL import Image -else: - - class Image: - @staticmethod - def open(*args, **kwargs): - pass - - -def hashimage(image: Image) -> str: - m = insecure_hashlib.md5(image.tobytes()) - return m.hexdigest()[:10] - - -def mask_to_test_readable(mask: Image) -> Dict: - npimg = np.array(mask) - white_pixels = (npimg == 255).sum() - shape = npimg.shape - return {"hash": hashimage(mask), "white_pixels": white_pixels, "shape": shape} - - -def mask_to_test_readable_only_shape(mask: Image) -> Dict: - npimg = np.array(mask) - shape = npimg.shape - return {"shape": shape} - - -@is_pipeline_test -@require_vision -@require_mindspore -class ImageSegmentationPipelineTests(unittest.TestCase): - model_mapping = dict( - (list(MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items()) if MODEL_FOR_IMAGE_SEGMENTATION_MAPPING else []) - + (MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() if MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING else []) - + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else []) - ) - - def get_test_pipeline( - self, - model, - tokenizer=None, - image_processor=None, - 
feature_extractor=None, - processor=None, - ms_dtype="float32", - ): - image_segmenter = ImageSegmentationPipeline( - model=model, - tokenizer=tokenizer, - feature_extractor=feature_extractor, - image_processor=image_processor, - processor=processor, - ms_dtype=ms_dtype, - ) - return image_segmenter, [ - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - - def run_pipeline_test(self, image_segmenter, examples): - outputs = image_segmenter( - "./tests/fixtures/tests_samples/COCO/000000039769.png", - threshold=0.0, - mask_threshold=0, - overlap_mask_area_threshold=0, - ) - self.assertIsInstance(outputs, list) - n = len(outputs) - if isinstance(image_segmenter.model, (MaskFormerForInstanceSegmentation, DetrForSegmentation)): - # Instance segmentation (maskformer, and detr) have a slot for null class - # and can output nothing even with a low threshold - self.assertGreaterEqual(n, 0) - else: - self.assertGreaterEqual(n, 1) - # XXX: PIL.Image implements __eq__ which bypasses ANY, so we inverse the comparison - # to make it work - self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, outputs) - - # we use revision="refs/pr/1" until the PR is merged - # https://hf.co/datasets/hf-internal-testing/fixtures_image_utils/discussions/1 - dataset = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1") - - # RGBA - outputs = image_segmenter(dataset[0]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) - m = len(outputs) - self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs) - # LA - outputs = image_segmenter(dataset[1]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) - m = len(outputs) - self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs) - # L - outputs = image_segmenter(dataset[2]["image"], threshold=0.0, mask_threshold=0, overlap_mask_area_threshold=0) - m = len(outputs) - self.assertEqual([{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * m, outputs) - - if isinstance(image_segmenter.model, DetrForSegmentation): - # We need to test batch_size with images with the same size. - # Detr doesn't normalize the size of the images, meaning we can have - # 800x800 or 800x1200, meaning we cannot batch simply. 
- # We simply bail on this - batch_size = 1 - else: - batch_size = 2 - - # 5 times the same image so the output shape is predictable - batch = [ - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - "./tests/fixtures/tests_samples/COCO/000000039769.png", - ] - outputs = image_segmenter( - batch, - threshold=0.0, - mask_threshold=0, - overlap_mask_area_threshold=0, - batch_size=batch_size, - ) - self.assertEqual(len(batch), len(outputs)) - self.assertEqual(len(outputs[0]), n) - self.assertEqual( - [ - [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, - [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, - [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, - [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, - [{"score": ANY(float, type(None)), "label": ANY(str), "mask": ANY(Image.Image)}] * n, - ], - outputs, - f"Expected [{n}, {n}, {n}, {n}, {n}], got {[len(item) for item in outputs]}", - ) - - @require_mindspore - def test_small_model_no_panoptic(self): - model_id = "hf-internal-testing/tiny-random-mobilevit" - # The default task is `image-classification` we need to override - pipe = pipeline(task="image-segmentation", model=model_id) - - # This model does NOT support neither `instance` nor `panoptic` - # We should error out - with self.assertRaises(ValueError) as e: - pipe("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="panoptic") - self.assertEqual( - str(e.exception), - "Subtask panoptic is not supported for model ", - ) - with self.assertRaises(ValueError) as e: - pipe("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="instance") - self.assertEqual( - str(e.exception), - "Subtask instance is not supported for model ", - ) - - @require_mindspore - def test_small_model(self): - model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic" - - model = AutoModelForImageSegmentation.from_pretrained(model_id) - image_processor = AutoImageProcessor.from_pretrained(model_id) - image_segmenter = ImageSegmentationPipeline( - model=model, - image_processor=image_processor, - subtask="panoptic", - threshold=0.0, - mask_threshold=0.0, - overlap_mask_area_threshold=0.0, - ) - - outputs = image_segmenter( - "http://images.cocodataset.org/val2017/000000039769.jpg", - ) - - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - # This is extremely brittle, and those values are made specific for the CI. 
- self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.004, - "label": "LABEL_215", - "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200}, - }, - ], - ) - - outputs = image_segmenter( - [ - "http://images.cocodataset.org/val2017/000000039769.jpg", - "http://images.cocodataset.org/val2017/000000039769.jpg", - ], - ) - for output in outputs: - for o in output: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [ - { - "score": 0.004, - "label": "LABEL_215", - "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200}, - }, - ], - [ - { - "score": 0.004, - "label": "LABEL_215", - "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200}, - }, - ], - ], - ) - - output = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="instance") - for o in output: - o["mask"] = mask_to_test_readable(o["mask"]) - self.assertEqual( - nested_simplify(output, decimals=4), - [ - { - "score": 0.004, - "label": "LABEL_215", - "mask": {"hash": "a01498ca7c", "shape": (480, 640), "white_pixels": 307200}, - }, - ], - ) - - # This must be surprising to the reader. - # The `panoptic` returns only LABEL_215, and this returns 3 labels. - # - output = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", subtask="semantic") - - output_masks = [o["mask"] for o in output] - - # page links (to visualize) - expected_masks = [ - "https://huggingface.co/datasets/hf-internal-testing/mask-for-image-segmentation-tests/blob/main/mask_0.png", - "https://huggingface.co/datasets/hf-internal-testing/mask-for-image-segmentation-tests/blob/main/mask_1.png", - "https://huggingface.co/datasets/hf-internal-testing/mask-for-image-segmentation-tests/blob/main/mask_2.png", - ] - # actual links to get files - expected_masks = [x.replace("/blob/", "/resolve/") for x in expected_masks] - expected_masks = [Image.open(requests.get(image, stream=True).raw) for image in expected_masks] - - # Convert masks to numpy array - output_masks = [np.array(x) for x in output_masks] - expected_masks = [np.array(x) for x in expected_masks] - - self.assertEqual(output_masks[0].shape, expected_masks[0].shape) - self.assertEqual(output_masks[1].shape, expected_masks[1].shape) - self.assertEqual(output_masks[2].shape, expected_masks[2].shape) - - # With un-trained tiny random models, the output `logits` tensor is very likely to contain many values - # close to each other, which cause `argmax` to give quite different results when running the test on 2 - # environments. We use a lower threshold `0.9` here to avoid flakiness. 
- self.assertGreaterEqual(np.mean(output_masks[0] == expected_masks[0]), 0.9) - self.assertGreaterEqual(np.mean(output_masks[1] == expected_masks[1]), 0.9) - self.assertGreaterEqual(np.mean(output_masks[2] == expected_masks[2]), 0.9) - - for o in output: - o["mask"] = mask_to_test_readable_only_shape(o["mask"]) - self.maxDiff = None - self.assertEqual( - nested_simplify(output, decimals=4), - [ - { - "label": "LABEL_88", - "mask": {"shape": (480, 640)}, - "score": None, - }, - { - "label": "LABEL_101", - "mask": {"shape": (480, 640)}, - "score": None, - }, - { - "label": "LABEL_215", - "mask": {"shape": (480, 640)}, - "score": None, - }, - ], - ) - - @require_mindspore - def test_small_model_semantic(self): - model_id = "hf-internal-testing/tiny-random-beit-pipeline" - image_segmenter = pipeline(model=model_id) - outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg") - for o in outputs: - # shortening by hashing - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": None, - "label": "LABEL_0", - "mask": {"hash": "42d0907228", "shape": (480, 640), "white_pixels": 10714}, - }, - { - "score": None, - "label": "LABEL_1", - "mask": {"hash": "46b8cc3976", "shape": (480, 640), "white_pixels": 296486}, - }, - ], - ) - - @require_mindspore - @slow - def test_integration_torch_image_segmentation(self): - model_id = "facebook/detr-resnet-50-panoptic" - image_segmenter = pipeline( - "image-segmentation", - model=model_id, - threshold=0.0, - overlap_mask_area_threshold=0.0, - ) - - outputs = image_segmenter( - "http://images.cocodataset.org/val2017/000000039769.jpg", - ) - - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.9094, - "label": "blanket", - "mask": {"hash": "dcff19a97a", "shape": (480, 640), "white_pixels": 16617}, - }, - { - "score": 0.9941, - "label": "cat", - "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185}, - }, - { - "score": 0.9987, - "label": "remote", - "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182}, - }, - { - "score": 0.9995, - "label": "remote", - "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275}, - }, - { - "score": 0.9722, - "label": "couch", - "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380}, - }, - { - "score": 0.9994, - "label": "cat", - "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561}, - }, - ], - ) - - outputs = image_segmenter( - [ - "http://images.cocodataset.org/val2017/000000039769.jpg", - "http://images.cocodataset.org/val2017/000000039769.jpg", - ], - ) - - # Shortening by hashing - for output in outputs: - for o in output: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - [ - { - "score": 0.9094, - "label": "blanket", - "mask": {"hash": "dcff19a97a", "shape": (480, 640), "white_pixels": 16617}, - }, - { - "score": 0.9941, - "label": "cat", - "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185}, - }, - { - "score": 0.9987, - "label": "remote", - "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182}, - }, - { - "score": 0.9995, - "label": "remote", - "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275}, - }, - { - "score": 0.9722, - "label": "couch", - "mask": {"hash": "37b8446ac5", "shape": 
(480, 640), "white_pixels": 172380}, - }, - { - "score": 0.9994, - "label": "cat", - "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561}, - }, - ], - [ - { - "score": 0.9094, - "label": "blanket", - "mask": {"hash": "dcff19a97a", "shape": (480, 640), "white_pixels": 16617}, - }, - { - "score": 0.9941, - "label": "cat", - "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185}, - }, - { - "score": 0.9987, - "label": "remote", - "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182}, - }, - { - "score": 0.9995, - "label": "remote", - "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275}, - }, - { - "score": 0.9722, - "label": "couch", - "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380}, - }, - { - "score": 0.9994, - "label": "cat", - "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561}, - }, - ], - ], - ) - - @require_mindspore - @slow - def test_threshold(self): - model_id = "facebook/detr-resnet-50-panoptic" - image_segmenter = pipeline("image-segmentation", model=model_id) - - outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.999) - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.9995, - "label": "remote", - "mask": {"hash": "d02404f578", "shape": (480, 640), "white_pixels": 2789}, - }, - { - "score": 0.9994, - "label": "cat", - "mask": {"hash": "eaa115b40c", "shape": (480, 640), "white_pixels": 304411}, - }, - ], - ) - - outputs = image_segmenter("http://images.cocodataset.org/val2017/000000039769.jpg", threshold=0.5) - - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.9941, - "label": "cat", - "mask": {"hash": "9c0af87bd0", "shape": (480, 640), "white_pixels": 59185}, - }, - { - "score": 0.9987, - "label": "remote", - "mask": {"hash": "c7870600d6", "shape": (480, 640), "white_pixels": 4182}, - }, - { - "score": 0.9995, - "label": "remote", - "mask": {"hash": "ef899a25fd", "shape": (480, 640), "white_pixels": 2275}, - }, - { - "score": 0.9722, - "label": "couch", - "mask": {"hash": "37b8446ac5", "shape": (480, 640), "white_pixels": 172380}, - }, - { - "score": 0.9994, - "label": "cat", - "mask": {"hash": "6a09d3655e", "shape": (480, 640), "white_pixels": 52561}, - }, - ], - ) - - @require_mindspore - @slow - def test_maskformer(self): - threshold = 0.8 - model_id = "facebook/maskformer-swin-base-ade" - - model = AutoModelForInstanceSegmentation.from_pretrained(model_id) - image_processor = AutoImageProcessor.from_pretrained(model_id) - - image_segmenter = pipeline("image-segmentation", model=model, image_processor=image_processor) - - image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) - file = image[0]["file"] - outputs = image_segmenter(file, threshold=threshold) - - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.9974, - "label": "wall", - "mask": {"hash": "a547b7c062", "shape": (512, 683), "white_pixels": 14252}, - }, - { - "score": 0.949, - "label": "house", - "mask": {"hash": "0da9b7b38f", "shape": (512, 683), "white_pixels": 132177}, - }, - { - "score": 0.9995, - "label": "grass", - "mask": {"hash": "1d07ea0a26", 
"shape": (512, 683), "white_pixels": 53444}, - }, - { - "score": 0.9976, - "label": "tree", - "mask": {"hash": "6cdc97c7da", "shape": (512, 683), "white_pixels": 7944}, - }, - { - "score": 0.8239, - "label": "plant", - "mask": {"hash": "1ab4ce378f", "shape": (512, 683), "white_pixels": 4136}, - }, - { - "score": 0.9942, - "label": "road, route", - "mask": {"hash": "39c5d17be5", "shape": (512, 683), "white_pixels": 1941}, - }, - { - "score": 1.0, - "label": "sky", - "mask": {"hash": "a3756324a6", "shape": (512, 683), "white_pixels": 135802}, - }, - ], - ) - - @require_mindspore - @slow - def test_oneformer(self): - image_segmenter = pipeline(model="shi-labs/oneformer_ade20k_swin_tiny") - - image = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) - file = image[0]["file"] - outputs = image_segmenter(file, threshold=0.99) - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.9981, - "label": "grass", - "mask": {"hash": "3a92904d4c", "white_pixels": 118131, "shape": (512, 683)}, - }, - { - "score": 0.9992, - "label": "sky", - "mask": {"hash": "fa2300cc9a", "white_pixels": 231565, "shape": (512, 683)}, - }, - ], - ) - - # Different task - outputs = image_segmenter(file, threshold=0.99, subtask="instance") - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": 0.9991, - "label": "sky", - "mask": {"hash": "8b1ffad016", "white_pixels": 230566, "shape": (512, 683)}, - }, - { - "score": 0.9981, - "label": "grass", - "mask": {"hash": "9bbdf83d3d", "white_pixels": 119130, "shape": (512, 683)}, - }, - ], - ) - - # Different task - outputs = image_segmenter(file, subtask="semantic") - # Shortening by hashing - for o in outputs: - o["mask"] = mask_to_test_readable(o["mask"]) - - self.assertEqual( - nested_simplify(outputs, decimals=4), - [ - { - "score": None, - "label": "wall", - "mask": {"hash": "897fb20b7f", "white_pixels": 14506, "shape": (512, 683)}, - }, - { - "score": None, - "label": "building", - "mask": {"hash": "f2a68c63e4", "white_pixels": 125019, "shape": (512, 683)}, - }, - { - "score": None, - "label": "sky", - "mask": {"hash": "e0ca3a548e", "white_pixels": 135330, "shape": (512, 683)}, - }, - { - "score": None, - "label": "tree", - "mask": {"hash": "7c9544bcac", "white_pixels": 16263, "shape": (512, 683)}, - }, - { - "score": None, - "label": "road, route", - "mask": {"hash": "2c7704e491", "white_pixels": 2143, "shape": (512, 683)}, - }, - { - "score": None, - "label": "grass", - "mask": {"hash": "bf6c2867e0", "white_pixels": 53040, "shape": (512, 683)}, - }, - { - "score": None, - "label": "plant", - "mask": {"hash": "93c4b7199e", "white_pixels": 3335, "shape": (512, 683)}, - }, - { - "score": None, - "label": "house", - "mask": {"hash": "93ec419ad5", "white_pixels": 60, "shape": (512, 683)}, - }, - ], - ) - - def test_save_load(self): - model_id = "hf-internal-testing/tiny-detr-mobilenetsv3-panoptic" - - model = AutoModelForImageSegmentation.from_pretrained(model_id) - image_processor = AutoImageProcessor.from_pretrained(model_id) - image_segmenter = pipeline( - task="image-segmentation", - model=model, - image_processor=image_processor, - ) - with tempfile.TemporaryDirectory() as tmpdirname: - image_segmenter.save_pretrained(tmpdirname) - pipeline(task="image-segmentation", model=tmpdirname) \ No newline at 
end of file diff --git a/tests/transformers/pipelines/test_pipelines_question_answering.py b/tests/transformers/pipelines/test_pipelines_question_answering.py deleted file mode 100644 index ef611b317..000000000 --- a/tests/transformers/pipelines/test_pipelines_question_answering.py +++ /dev/null @@ -1,491 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint: disable=missing-function-docstring -# pylint: disable=missing-class-docstring -# pylint: disable=no-else-return -# pylint: disable=arguments-renamed -# pylint: disable=missing-module-docstring -# pylint: disable=invalid-name -# pylint: disable=trailing-whitespace - -import unittest - -from mindnlp.transformers import ( - MODEL_FOR_QUESTION_ANSWERING_MAPPING, - QuestionAnsweringPipeline, -) -from mindnlp.data.processors.squad import SquadExample -from mindnlp.transformers.pipelines import pipeline -from mindnlp.transformers.pipelines.question_answering import QuestionAnsweringArgumentHandler -from mindnlp.utils.testing_utils import ( - is_pipeline_test, - nested_simplify, - require_mindspore, - slow, -) - -from .test_pipelines_common import ANY - - -# These 2 model types require different inputs than those of the usual text models. -_TO_SKIP = {"LayoutLMv2Config", "LayoutLMv3Config"} - - -@is_pipeline_test -class QAPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING - - if model_mapping is not None: - model_mapping = {config: model for config, model in model_mapping.items() if config.__name__ not in _TO_SKIP} - - def get_test_pipeline(self, model, tokenizer): - question_answerer = QuestionAnsweringPipeline(model, tokenizer) - - examples = [ - {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, - {"question": "In what field is HuggingFace ?", "context": "HuggingFace is an AI startup."}, - ] - return question_answerer, examples - - def run_pipeline_test(self, question_answerer, _): - outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." 
- ) - self.assertEqual(outputs, {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}) - outputs = question_answerer( - question="Where was HuggingFace founded ?", - context="HuggingFace was founded in Paris.", - handle_impossible_answer=True, - ) - self.assertEqual(outputs, {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}) - - outputs = question_answerer( - question=["In what field is HuggingFace working ?", "In what field is HuggingFace working ?"], - context="HuggingFace was founded in Paris.", - ) - self.assertEqual( - outputs, - [ - {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}, - {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}, - ], - ) - - outputs = question_answerer( - question=["What field is HuggingFace working ?", "In what field is HuggingFace ?"], - context=[ - "HuggingFace is a startup based in New-York", - "HuggingFace is a startup founded in Paris", - ], - ) - self.assertEqual( - outputs, - [ - {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}, - {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}, - ], - ) - - with self.assertRaises(ValueError): - question_answerer(question="", context="HuggingFace was founded in Paris.") - with self.assertRaises(ValueError): - question_answerer(question=None, context="HuggingFace was founded in Paris.") - with self.assertRaises(ValueError): - question_answerer(question="In what field is HuggingFace working ?", context="") - with self.assertRaises(ValueError): - question_answerer(question="In what field is HuggingFace working ?", context=None) - - outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris.", top_k=20 - ) - self.assertEqual( - outputs, [{"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)} for i in range(20)] - ) - - # Very long context require multiple features - outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." * 20 - ) - self.assertEqual(outputs, {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}) - - # Using batch is OK - if question_answerer.tokenizer.pad_token_id is None: - question_answerer.tokenizer.pad_token_id = question_answerer.model.config.eos_token_id - new_outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." * 20, batch_size=2 - ) - self.assertEqual(new_outputs, {"answer": ANY(str), "start": ANY(int), "end": ANY(int), "score": ANY(float)}) - self.assertEqual(nested_simplify(outputs), nested_simplify(new_outputs)) - - @require_mindspore - def test_small_model_pt(self): - question_answerer = pipeline( - "question-answering", model="sshleifer/tiny-distilbert-base-cased-distilled-squad" - ) - - outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." 
- ) - - self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) - - @require_mindspore - def test_small_model_pt_iterator(self): - # https://github.com/huggingface/transformers/issues/18510 - pipe = pipeline(model="sshleifer/tiny-distilbert-base-cased-distilled-squad", batch_size=16, framework="ms") - - def data(): - for _ in range(10): - yield {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."} - - for outputs in pipe(data()): - self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) - - @require_mindspore - def test_small_model_pt_softmax_trick(self): - question_answerer = pipeline( - "question-answering", model="sshleifer/tiny-distilbert-base-cased-distilled-squad" - ) - - real_postprocess = question_answerer.postprocess - - # Tweak start and stop to make sure we encounter the softmax logits - # bug. - def ensure_large_logits_postprocess( - model_outputs, - top_k=1, - handle_impossible_answer=False, - max_answer_len=15, - ): - for output in model_outputs: - output["start"] = output["start"] * 1e6 - output["end"] = output["end"] * 1e6 - return real_postprocess( - model_outputs, - top_k=top_k, - handle_impossible_answer=handle_impossible_answer, - max_answer_len=max_answer_len, - ) - - question_answerer.postprocess = ensure_large_logits_postprocess - - outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." - ) - - self.assertEqual(nested_simplify(outputs), {"score": 0.028, "start": 0, "end": 11, "answer": "HuggingFace"}) - - @slow - @require_mindspore - def test_small_model_long_context_cls_slow(self): - question_answerer = pipeline( - "question-answering", - model="deepset/roberta-base-squad2", - handle_impossible_answer=True, - max_seq_length=512, - ) - outputs = question_answerer( - question="What country is Paris the capital of?", - context="""London is the capital and largest city of England and the United Kingdom. It stands on the - River Thames in south-east England at the head of a 50-mile (80 km) estuary down to the North Sea, - and has been a major settlement for two millennia. The City of London, its ancient core and financial - centre, was founded by the Romans as Londinium and retains boundaries close to its medieval ones. Since - the 19th century, \"London\" has also referred to the metropolis around this core, historically split - between the counties of Middlesex, Essex, Surrey, Kent, and Hertfordshire, which largely comprises - Greater London, governed by the Greater London Authority. The City of Westminster, to the west of the - City of London, has for centuries held the national government and parliament. As one of the world's - global cities, London exerts strong influence on its arts, commerce, education, entertainment, fashion, - finance, health care, media, tourism, and communications, and has sometimes been called the capital of - the world. Its GDP (€801.66 billion in 2017) makes it the biggest urban economy in Europe, and it is one - of the major financial centres in the world. In 2019 it had the second-highest number of ultra - high-net-worth individuals in Europe after Paris and the second-highest number of billionaires in Europe - after Moscow. As of 2021, London has the most millionaires of any city. 
With Europe's largest - concentration of higher education institutions, it includes Imperial College London in natural and - applied sciences, the London School of Economics in social sciences, and the comprehensive University - College London. The city is home to the most 5-star hotels of any city in the world. In 2012, - London became the first city to host three Summer Olympic Games. London is the capital and largest city - of England and the United Kingdom. It stands on the River Thames in south-east England at the head of a - 50-mile (80 km) estuary down to the North Sea, and has been a major settlement for two millennia. The - City of London, its ancient core and financial centre, was founded by the Romans as Londinium and retains - boundaries close to its medieval ones. Since the 19th century, \"London\" has also referred to the - metropolis around this core, historically split between the counties of Middlesex, Essex, Surrey, Kent, - and Hertfordshire, which largely comprises Greater London, governed by the Greater London Authority. The - City of Westminster, to the west of the City of London, has for centuries held the national government - and parliament. As one of the world's global cities, London exerts strong influence on its arts, - commerce, education, entertainment, fashion, finance, health care, media, tourism, and communications, - and has sometimes been called the capital of the world. Its GDP (€801.66 billion in 2017) makes it the - biggest urban economy in Europe, and it is one of the major financial centres in the world. In 2019 it - had the second-highest number of ultra high-net-worth individuals in Europe after Paris and the - second-highest number of billionaires in Europe after Moscow. As of 2021, London has the most - millionaires of any city. With Europe's largest concentration of higher education institutions, - it includes Imperial College London in natural and applied sciences, the London School of Economics in - social sciences, and the comprehensive University College London. The city is home to the most 5-star - hotels of any city in the world. In 2012, London became the first city to host three Summer Olympic - Games.""", - ) - self.assertEqual(nested_simplify(outputs), {"score": 0.988, "start": 0, "end": 0, "answer": ""}) - - @slow - @require_mindspore - def test_large_model_pt(self): - question_answerer = pipeline( - "question-answering", - ) - outputs = question_answerer( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." - ) - - self.assertEqual(nested_simplify(outputs), {"score": 0.979, "start": 27, "end": 32, "answer": "Paris"}) - - @slow - @require_mindspore - def test_large_model_issue(self): - qa_pipeline = pipeline( - "question-answering", - model="mrm8488/bert-multi-cased-finetuned-xquadv1", - ) - outputs = qa_pipeline( - { - "context": ( - "Yes Bank founder Rana Kapoor has approached the Bombay High Court, challenging a special court's" - " order from August this year that had remanded him in police custody for a week in a multi-crore" - " loan fraud case. Kapoor, who is currently lodged in Taloja Jail, is an accused in the loan fraud" - " case and some related matters being probed by the CBI and Enforcement Directorate. A single" - " bench presided over by Justice S K Shinde on Tuesday posted the plea for further hearing on" - " October 14. 
In his plea filed through advocate Vijay Agarwal, Kapoor claimed that the special" - " court's order permitting the CBI's request for police custody on August 14 was illegal and in" - " breach of the due process of law. Therefore, his police custody and subsequent judicial custody" - " in the case were all illegal. Kapoor has urged the High Court to quash and set aside the special" - " court's order dated August 14. As per his plea, in August this year, the CBI had moved two" - " applications before the special court, one seeking permission to arrest Kapoor, who was already" - " in judicial custody at the time in another case, and the other, seeking his police custody." - " While the special court refused to grant permission to the CBI to arrest Kapoor, it granted the" - " central agency's plea for his custody. Kapoor, however, said in his plea that before filing an" - " application for his arrest, the CBI had not followed the process of issuing him a notice under" - " Section 41 of the CrPC for appearance before it. He further said that the CBI had not taken" - " prior sanction as mandated under section 17 A of the Prevention of Corruption Act for" - " prosecuting him. The special court, however, had said in its order at the time that as Kapoor" - " was already in judicial custody in another case and was not a free man the procedure mandated" - " under Section 41 of the CrPC need not have been adhered to as far as issuing a prior notice of" - " appearance was concerned. ADVERTISING It had also said that case records showed that the" - " investigating officer had taken an approval from a managing director of Yes Bank before" - " beginning the proceedings against Kapoor and such a permission was a valid sanction. However," - " Kapoor in his plea said that the above order was bad in law and sought that it be quashed and" - " set aside. The law mandated that if initial action was not in consonance with legal procedures," - " then all subsequent actions must be held as illegal, he said, urging the High Court to declare" - " the CBI remand and custody and all subsequent proceedings including the further custody as" - " illegal and void ab-initio. In a separate plea before the High Court, Kapoor's daughter Rakhee" - " Kapoor-Tandon has sought exemption from in-person appearance before a special PMLA court. Rakhee" - " has stated that she is a resident of the United Kingdom and is unable to travel to India owing" - " to restrictions imposed due to the COVID-19 pandemic. According to the CBI, in the present case," - " Kapoor had obtained a gratification or pecuniary advantage of ₹ 307 crore, and thereby caused" - " Yes Bank a loss of ₹ 1,800 crore by extending credit facilities to Avantha Group, when it was" - " not eligible for the same" - ), - "question": "Is this person invovled in fraud?", - } - ) - self.assertEqual( - nested_simplify(outputs), - {"answer": "an accused in the loan fraud case", "end": 294, "score": 0.001, "start": 261}, - ) - - @slow - @require_mindspore - def test_large_model_course(self): - question_answerer = pipeline("question-answering") - long_context = """ -🤗 Transformers: State of the Art NLP - -🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, -question answering, summarization, translation, text generation and more in over 100 languages. -Its aim is to make cutting-edge NLP easier to use for everyone. 
- -🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and -then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and -can be modified to enable quick research experiments. - -Why should I use transformers? - -1. Easy-to-use state-of-the-art models: - - High performance on NLU and NLG tasks. - - Low barrier to entry for educators and practitioners. - - Few user-facing abstractions with just three classes to learn. - - A unified API for using all our pretrained models. - - Lower compute costs, smaller carbon footprint: - -2. Researchers can share trained models instead of always retraining. - - Practitioners can reduce compute time and production costs. - - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages. - -3. Choose the right framework for every part of a model's lifetime: - - Train state-of-the-art models in 3 lines of code. - - Move a single model between TF2.0/PyTorch frameworks at will. - - Seamlessly pick the right framework for training, evaluation and production. - -4. Easily customize a model or an example to your needs: - - We provide examples for each architecture to reproduce the results published by its original authors. - - Model internals are exposed as consistently as possible. - - Model files can be used independently of the library for quick experiments. - -🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration -between them. It's straightforward to train your models with one before loading them for inference with the other. -""" - question = "Which deep learning libraries back 🤗 Transformers?" - outputs = question_answerer(question=question, context=long_context) - - self.assertEqual( - nested_simplify(outputs), - {"answer": "Jax, PyTorch and TensorFlow", "end": 1919, "score": 0.971, "start": 1892}, - ) - - -@require_mindspore -class QuestionAnsweringArgumentHandlerTests(unittest.TestCase): - def test_argument_handler(self): - qa = QuestionAnsweringArgumentHandler() - - Q = "Where was HuggingFace founded ?" 
- C = "HuggingFace was founded in Paris" - - normalized = qa(Q, C) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa(question=Q, context=C) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa(question=Q, context=C) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa(question=[Q, Q], context=C) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 2) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa({"question": Q, "context": C}) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa([{"question": Q, "context": C}]) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa([{"question": Q, "context": C}, {"question": Q, "context": C}]) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 2) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa(X={"question": Q, "context": C}) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa(X=[{"question": Q, "context": C}]) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - normalized = qa(data={"question": Q, "context": C}) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 1) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - def test_argument_handler_error_handling(self): - qa = QuestionAnsweringArgumentHandler() - - Q = "Where was HuggingFace founded ?" 
- C = "HuggingFace was founded in Paris" - - with self.assertRaises(KeyError): - qa({"context": C}) - with self.assertRaises(KeyError): - qa({"question": Q}) - with self.assertRaises(KeyError): - qa([{"context": C}]) - with self.assertRaises(ValueError): - qa(None, C) - with self.assertRaises(ValueError): - qa("", C) - with self.assertRaises(ValueError): - qa(Q, None) - with self.assertRaises(ValueError): - qa(Q, "") - - with self.assertRaises(ValueError): - qa(question=None, context=C) - with self.assertRaises(ValueError): - qa(question="", context=C) - with self.assertRaises(ValueError): - qa(question=Q, context=None) - with self.assertRaises(ValueError): - qa(question=Q, context="") - - with self.assertRaises(ValueError): - qa({"question": None, "context": C}) - with self.assertRaises(ValueError): - qa({"question": "", "context": C}) - with self.assertRaises(ValueError): - qa({"question": Q, "context": None}) - with self.assertRaises(ValueError): - qa({"question": Q, "context": ""}) - - with self.assertRaises(ValueError): - qa([{"question": Q, "context": C}, {"question": None, "context": C}]) - with self.assertRaises(ValueError): - qa([{"question": Q, "context": C}, {"question": "", "context": C}]) - - with self.assertRaises(ValueError): - qa([{"question": Q, "context": C}, {"question": Q, "context": None}]) - with self.assertRaises(ValueError): - qa([{"question": Q, "context": C}, {"question": Q, "context": ""}]) - - with self.assertRaises(ValueError): - qa(question={"This": "Is weird"}, context="This is a context") - - with self.assertRaises(ValueError): - qa(question=[Q, Q], context=[C, C, C]) - - with self.assertRaises(ValueError): - qa(question=[Q, Q, Q], context=[C, C]) - - def test_argument_handler_old_format(self): - qa = QuestionAnsweringArgumentHandler() - - Q = "Where was HuggingFace founded ?" - C = "HuggingFace was founded in Paris" - # Backward compatibility for this - normalized = qa(question=[Q, Q], context=[C, C]) - self.assertEqual(type(normalized), list) - self.assertEqual(len(normalized), 2) - self.assertEqual({type(el) for el in normalized}, {SquadExample}) - - def test_argument_handler_error_handling_odd(self): - qa = QuestionAnsweringArgumentHandler() - with self.assertRaises(ValueError): - qa(None) - - with self.assertRaises(ValueError): - qa(Y=None) - - with self.assertRaises(ValueError): - qa(1) diff --git a/tests/transformers/pipelines/test_pipelines_table_question_answering.py b/tests/transformers/pipelines/test_pipelines_table_question_answering.py deleted file mode 100644 index e4097fd3e..000000000 --- a/tests/transformers/pipelines/test_pipelines_table_question_answering.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -from mindnlp.transformers import ( - MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, - AutoModelForTableQuestionAnswering, - AutoTokenizer, pipeline -) - -from mindnlp.transformers.pipelines.table_question_answering import ( - TableQuestionAnsweringPipeline -) - -from mindnlp.utils.testing_utils import is_pipeline_test, require_mindspore, slow - - -@is_pipeline_test -class TQAPipelineTests(unittest.TestCase): - # Putting it there for consistency, but TQA do not have fast tokenizer - # which are needed to generate automatic tests - model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING - - @require_mindspore - def test_small_model(self): - model_id = "lysandre/tiny-tapas-random-wtq" - model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - self.assertIsInstance(model.config.aggregation_labels, dict) - self.assertIsInstance(model.config.no_aggregation_label_index, int) - - table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) - outputs = table_querier( - table={ - "actors": ["brad pitt", "leonardo di caprio", "george clooney"], - "age": ["56", "45", "59"], - "number of movies": ["87", "53", "69"], - "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - }, - query="how many movies has george clooney played in?", - ) - self.assertEqual( - outputs, - [{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}], - ) - outputs = table_querier( - table={ - "actors": ["brad pitt", "leonardo di caprio", "george clooney"], - "age": ["56", "45", "59"], - "number of movies": ["87", "53", "69"], - "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - }, - query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], - ) - self.assertEqual( - outputs, - [[ - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - ]], - ) - outputs = table_querier( - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - }, - query=[ - "What repository has the largest number of stars?", - "Given that the numbers of stars defines if a repository is active, what repository is the most" - " active?", - "What is the number of repositories?", - "What is the average number of stars?", - "What is the total amount of stars?", - ], - ) - self.assertEqual( - outputs, - [[ - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, - ]], - ) - - with self.assertRaises(ValueError): - table_querier(query="What does it do with empty context ?", table=None) - with self.assertRaises(ValueError): - table_querier(query="What does it do with empty context ?", table="") - with self.assertRaises(ValueError): - table_querier(query="What does it do with empty context ?", table={}) - with 
self.assertRaises(ValueError): - table_querier( - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - } - ) - with self.assertRaises(ValueError): - table_querier( - query="", - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - }, - ) - with self.assertRaises(ValueError): - table_querier( - query=None, - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - }, - ) - - @require_mindspore - def test_slow_tokenizer_sqa(self): - model_id = "lysandre/tiny-tapas-random-sqa" - model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) - tokenizer = AutoTokenizer.from_pretrained(model_id) - table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) - - inputs = { - "table": { - "actors": ["brad pitt", "leonardo di caprio", "george clooney"], - "age": ["56", "45", "59"], - "number of movies": ["87", "53", "69"], - "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - }, - "query": ["how many movies has george clooney played in?", "how old is he?", - "what's his date of birth?"], - } - sequential_outputs = table_querier(**inputs, sequential=True)[0] - batch_outputs = table_querier(**inputs, sequential=False)[0] - - self.assertEqual(len(sequential_outputs), 3) - self.assertEqual(len(batch_outputs), 3) - self.assertEqual(sequential_outputs[0], batch_outputs[0]) - self.assertNotEqual(sequential_outputs[1], batch_outputs[1]) - # self.assertNotEqual(sequential_outputs[2], batch_outputs[2]) - - table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) - outputs = table_querier( - table={ - "actors": ["brad pitt", "leonardo di caprio", "george clooney"], - "age": ["56", "45", "59"], - "number of movies": ["87", "53", "69"], - "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - }, - query="how many movies has george clooney played in?", - ) - self.assertEqual( - outputs, - [{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}], - ) - outputs = table_querier( - table={ - "actors": ["brad pitt", "leonardo di caprio", "george clooney"], - "age": ["56", "45", "59"], - "number of movies": ["87", "53", "69"], - "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - }, - query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], - ) - self.assertEqual( - outputs, - [[ - {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, - {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, - {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, - ]], - ) - outputs = table_querier( - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - }, - query=[ - "What repository has the largest number of stars?", - "Given that the numbers of stars defines if a repository is active, 
what repository is the most" - " active?", - "What is the number of repositories?", - "What is the average number of stars?", - "What is the total amount of stars?", - ], - ) - self.assertEqual( - outputs, - [[ - {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, - {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, - {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, - {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, - {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, - ]], - ) - - with self.assertRaises(ValueError): - table_querier(query="What does it do with empty context ?", table=None) - with self.assertRaises(ValueError): - table_querier(query="What does it do with empty context ?", table="") - with self.assertRaises(ValueError): - table_querier(query="What does it do with empty context ?", table={}) - with self.assertRaises(ValueError): - table_querier( - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - } - ) - with self.assertRaises(ValueError): - table_querier( - query="", - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - }, - ) - with self.assertRaises(ValueError): - table_querier( - query=None, - table={ - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - }, - ) - - @slow - @require_mindspore - def test_integration_wtq(self): - table_querier = pipeline("table-question-answering", model="google/tapas-base-finetuned-wtq") - - data = { - "Repository": ["Transformers", "Datasets", "Tokenizers"], - "Stars": ["36542", "4512", "3934"], - "Contributors": ["651", "77", "34"], - "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], - } - queries = [ - "What repository has the largest number of stars?", - "Given that the numbers of stars defines if a repository is active, what repository is the most active?", - "What is the number of repositories?", - "What is the average number of stars?", - "What is the total amount of stars?", - ] - - results = table_querier(data, queries) - - expected_results = [ - {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"}, - {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"}, - { - "answer": "COUNT > Transformers, Datasets, Tokenizers", - "coordinates": [(0, 0), (1, 0), (2, 0)], - "cells": ["Transformers", "Datasets", "Tokenizers"], - "aggregator": "COUNT", - }, - { - "answer": "AVERAGE > 36542, 4512, 3934", - "coordinates": [(0, 1), (1, 1), (2, 1)], - "cells": ["36542", "4512", "3934"], - "aggregator": "AVERAGE", - }, - { - "answer": "SUM > 36542, 4512, 3934", - "coordinates": [(0, 1), (1, 1), (2, 1)], - "cells": ["36542", "4512", "3934"], - "aggregator": "SUM", - }, - ] - self.assertListEqual(results[0], expected_results) - - @slow - @require_mindspore - def test_integration_sqa(self): - table_querier = pipeline( - 
"table-question-answering", - model="google/tapas-base-finetuned-sqa", - tokenizer="google/tapas-base-finetuned-sqa", - ) - data = { - "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], - "Age": ["56", "45", "59"], - "Number of movies": ["87", "53", "69"], - "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - } - queries = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"] - results = table_querier(data, queries, sequential=True) - - expected_results = [ - {"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]}, - {"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]}, - {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]}, - ] - self.assertListEqual(results[0], expected_results) - - @slow - @require_mindspore - def test_large_model_tapex(self): - model_id = "microsoft/tapex-large-finetuned-wtq" - table_querier = pipeline( - "table-question-answering", - model=model_id, - ) - data = { - "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], - "Age": ["56", "45", "59"], - "Number of movies": ["87", "53", "69"], - "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], - } - queries = [ - "How many movies has George Clooney played in?", - "How old is Mr Clooney ?", - "What's the date of birth of Leonardo ?", - ] - results = table_querier(data, queries, sequential=True) - - expected_results = [ - {"answer": " 69"}, - {"answer": " 59"}, - {"answer": " 10 june 1996"}, - ] - self.assertListEqual(results[0], expected_results) diff --git a/tests/transformers/pipelines/test_pipelines_text2text_generation.py b/tests/transformers/pipelines/test_pipelines_text2text_generation.py deleted file mode 100644 index 91254c744..000000000 --- a/tests/transformers/pipelines/test_pipelines_text2text_generation.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import mindspore -from mindspore import Tensor - -from mindnlp.transformers import ( - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - Text2TextGenerationPipeline, - pipeline, -) -from mindnlp.utils.testing_utils import is_pipeline_test, require_mindspore - -from .test_pipelines_common import ANY - - -@is_pipeline_test -class Text2TextGenerationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - - def get_test_pipeline(self, model, tokenizer, processor): - generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer) - return generator, ["Something to write", "Something else"] - - def run_pipeline_test(self, generator, _): - outputs = generator("Something there") - self.assertEqual(outputs, [{"generated_text": ANY(str)}]) - # These are encoder decoder, they don't just append to incoming string - self.assertFalse(outputs[0]["generated_text"].startswith("Something there")) - - outputs = generator(["This is great !", "Something else"], num_return_sequences=2, do_sample=True) - self.assertEqual( - outputs, - [ - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - ], - ) - - outputs = generator( - ["This is great !", "Something else"], num_return_sequences=2, batch_size=2, do_sample=True - ) - self.assertEqual( - outputs, - [ - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - ], - ) - - with self.assertRaises(ValueError): - generator(4) - - @require_mindspore - def test_small_model(self): - generator = pipeline("text2text-generation", model="patrickvonplaten/t5-tiny-random") - # do_sample=False necessary for reproducibility - outputs = generator("Something there", do_sample=False) - self.assertEqual(outputs, [{"generated_text": ""}]) - - num_return_sequences = 3 - outputs = generator( - "Something there", - num_return_sequences=num_return_sequences, - num_beams=num_return_sequences, - ) - target_outputs = [ - {"generated_text": "Beide Beide Beide Beide Beide Beide Beide Beide Beide"}, - {"generated_text": "Beide Beide Beide Beide Beide Beide Beide Beide"}, - {"generated_text": ""}, - ] - self.assertEqual(outputs, target_outputs) - - outputs = generator("This is a test", do_sample=True, num_return_sequences=2, return_tensors=True) - generator.tokenizer.pad_token_id = generator.model.config.eos_token_id - generator.tokenizer.pad_token = "" - outputs = generator( - ["This is a test", "This is a second test"], - do_sample=True, - num_return_sequences=2, - batch_size=2, - return_tensors=True, - ) - diff --git a/tests/transformers/pipelines/test_pipelines_text_classification.py b/tests/transformers/pipelines/test_pipelines_text_classification.py deleted file mode 100644 index b974242db..000000000 --- a/tests/transformers/pipelines/test_pipelines_text_classification.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from mindnlp.transformers import ( - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - TextClassificationPipeline, - pipeline, -) -from mindnlp.utils.testing_utils import is_pipeline_test, nested_simplify, require_mindspore, slow - -from .test_pipelines_common import ANY - - -# These 2 model types require different inputs than those of the usual text models. -_TO_SKIP = {"LayoutLMv2Config", "LayoutLMv3Config"} - - -@is_pipeline_test -class TextClassificationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING - - if model_mapping is not None: - model_mapping = {config: model for config, model in model_mapping.items() if config.__name__ not in _TO_SKIP} - - @require_mindspore - def test_small_model(self): - text_classifier = pipeline( - task="text-classification", model="hf-internal-testing/tiny-random-distilbert" - ) - - outputs = text_classifier("This is great !") - self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}]) - - outputs = text_classifier("This is great !", top_k=2) - self.assertEqual( - nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}] - ) - - outputs = text_classifier(["This is great !", "This is bad"], top_k=2) - self.assertEqual( - nested_simplify(outputs), - [ - [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}], - [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}], - ], - ) - - outputs = text_classifier("This is great !", top_k=1) - self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}]) - - # Legacy behavior - outputs = text_classifier("This is great !", return_all_scores=False) - self.assertEqual(nested_simplify(outputs), [{"label": "LABEL_0", "score": 0.504}]) - - outputs = text_classifier("This is great !", return_all_scores=True) - self.assertEqual( - nested_simplify(outputs), [[{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}]] - ) - - outputs = text_classifier(["This is great !", "Something else"], return_all_scores=True) - self.assertEqual( - nested_simplify(outputs), - [ - [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}], - [{"label": "LABEL_0", "score": 0.504}, {"label": "LABEL_1", "score": 0.496}], - ], - ) - - outputs = text_classifier(["This is great !", "Something else"], return_all_scores=False) - self.assertEqual( - nested_simplify(outputs), - [ - {"label": "LABEL_0", "score": 0.504}, - {"label": "LABEL_0", "score": 0.504}, - ], - ) - - @slow - @require_mindspore - def test_bert(self): - text_classifier = pipeline("text-classification") - - outputs = text_classifier("This is great !") - self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 1.0}]) - outputs = text_classifier("This is bad !") - self.assertEqual(nested_simplify(outputs), [{"label": "NEGATIVE", "score": 1.0}]) - outputs = text_classifier("Birds are a type of animal") - self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}]) - - def get_test_pipeline(self, model, tokenizer, processor): - text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer) - return text_classifier, ["HuggingFace is in", "This is another test"] - - def run_pipeline_test(self, text_classifier, _): - model = text_classifier.model - # Small inputs because BartTokenizer tiny has maximum 
position embeddings = 22 - valid_inputs = "HuggingFace is in" - outputs = text_classifier(valid_inputs) - - self.assertEqual(nested_simplify(outputs), [{"label": ANY(str), "score": ANY(float)}]) - self.assertTrue(outputs[0]["label"] in model.config.id2label.values()) - - valid_inputs = ["HuggingFace is in ", "Paris is in France"] - outputs = text_classifier(valid_inputs) - self.assertEqual( - nested_simplify(outputs), - [{"label": ANY(str), "score": ANY(float)}, {"label": ANY(str), "score": ANY(float)}], - ) - self.assertTrue(outputs[0]["label"] in model.config.id2label.values()) - self.assertTrue(outputs[1]["label"] in model.config.id2label.values()) - - # Forcing to get all results with `top_k=None` - # This is NOT the legacy format - outputs = text_classifier(valid_inputs, top_k=None) - N = len(model.config.id2label.values()) - self.assertEqual( - nested_simplify(outputs), - [[{"label": ANY(str), "score": ANY(float)}] * N, [{"label": ANY(str), "score": ANY(float)}] * N], - ) - - valid_inputs = {"text": "HuggingFace is in ", "text_pair": "Paris is in France"} - outputs = text_classifier(valid_inputs) - self.assertEqual( - nested_simplify(outputs), - {"label": ANY(str), "score": ANY(float)}, - ) - self.assertTrue(outputs["label"] in model.config.id2label.values()) - - # This might be used a text pair, but tokenizer + pipe interaction - # makes it hard to understand that it's not using the pair properly - # https://github.com/huggingface/transformers/issues/17305 - # We disabled this usage instead as it was outputting wrong outputs. - invalid_input = [["HuggingFace is in ", "Paris is in France"]] - with self.assertRaises(ValueError): - text_classifier(invalid_input) - - # This used to be valid for doing text pairs - # We're keeping it working because of backward compatibility - outputs = text_classifier([[["HuggingFace is in ", "Paris is in France"]]]) - self.assertEqual( - nested_simplify(outputs), - [{"label": ANY(str), "score": ANY(float)}], - ) - self.assertTrue(outputs[0]["label"] in model.config.id2label.values()) diff --git a/tests/transformers/pipelines/test_pipelines_text_generation.py b/tests/transformers/pipelines/test_pipelines_text_generation.py deleted file mode 100644 index 197469d1b..000000000 --- a/tests/transformers/pipelines/test_pipelines_text_generation.py +++ /dev/null @@ -1,354 +0,0 @@ - -import unittest - -from mindnlp.transformers import ( - MODEL_FOR_CAUSAL_LM_MAPPING, - TextGenerationPipeline, - pipeline, -) -from mindnlp.utils import logging -from mindnlp.utils.testing_utils import is_pipeline_test, nested_simplify, require_mindspore, slow, CaptureLogger -from .test_pipelines_common import ANY - - -@is_pipeline_test -class TextGenerationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING - - @require_mindspore - def test_small_model(self): - text_generator = pipeline(task="text-generation", model="t5-small") - # Using `do_sample=False` to force deterministic output - outputs = text_generator("This is a test", do_sample=False) - self.assertEqual( - outputs, - [ - { - "generated_text": ( - "This is a test is a test test This is a test test This is " - "a test" - ) - } - ], - ) - - outputs = text_generator(["This is a test", "This is a second test"]) - self.assertEqual( - outputs, - [ - [ - { - "generated_text": ( - "This is a test is a test test This is a test test This is " - "a test") - } - ], - [ - { - "generated_text": ( - "This is a second test is a second test test this is a " - "second test test this is") - } - ], - ], - ) - - 
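The small-model text-generation test above drives the pipeline in two output modes: deterministic text with do_sample=False, and raw token ids with return_tensors=True. A short sketch of both calls, reusing the t5-small checkpoint hard-coded in that test; the generated text is model-dependent, so only the shape of the returned dicts is assumed here:

# Sketch of the two output modes checked by the small-model generation test.
from mindnlp.transformers import pipeline

text_generator = pipeline(task="text-generation", model="t5-small")

# do_sample=False keeps decoding greedy, so the output is deterministic and
# can be compared verbatim, as the test does.
text_out = text_generator("This is a test", do_sample=False)
print(text_out[0]["generated_text"])

# return_tensors=True replaces "generated_text" with raw "generated_token_ids".
token_out = text_generator(
    "This is a test", do_sample=True, num_return_sequences=2, return_tensors=True
)
print([len(out["generated_token_ids"]) for out in token_out])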
outputs = text_generator("This is a test", do_sample=True, num_return_sequences=2, return_tensors=True) - self.assertEqual( - outputs, - [ - {"generated_token_ids": ANY(list)}, - {"generated_token_ids": ANY(list)}, - ], - ) - - ## -- test tokenizer_kwargs - test_str = "testing tokenizer kwargs. using truncation must result in a different generation." - input_len = len(text_generator.tokenizer(test_str)["input_ids"]) - output_str, output_str_with_truncation = ( - text_generator(test_str, do_sample=False, return_full_text=False, min_new_tokens=1)[0]["generated_text"], - text_generator( - test_str, - do_sample=False, - return_full_text=False, - min_new_tokens=1, - truncation=True, - max_length=input_len + 1, - )[0]["generated_text"], - ) - assert output_str != output_str_with_truncation # results must be different because one had truncation - - # -- what is the point of this test? padding is hardcoded False in the pipeline anyway - text_generator.tokenizer.pad_token_id = text_generator.model.config.eos_token_id - text_generator.tokenizer.pad_token = "" - outputs = text_generator( - ["This is a test", "This is a second test"], - do_sample=True, - num_return_sequences=2, - batch_size=2, - return_tensors=True, - ) - self.assertEqual( - outputs, - [ - [ - {"generated_token_ids": ANY(list)}, - {"generated_token_ids": ANY(list)}, - ], - [ - {"generated_token_ids": ANY(list)}, - {"generated_token_ids": ANY(list)}, - ], - ], - ) - - @require_mindspore - def test_small_chat_model(self): - text_generator = pipeline( - task="text-generation", model="gpt2" - ) - # Using `do_sample=False` to force deterministic output - chat1 = [ - {"role": "system", "content": "This is a system message."}, - {"role": "user", "content": "This is a test"}, - {"role": "assistant", "content": "This is a reply"}, - ] - chat2 = [ - {"role": "system", "content": "This is a system message."}, - {"role": "user", "content": "This is a second test"}, - {"role": "assistant", "content": "This is a reply"}, - ] - outputs = text_generator(chat1, do_sample=False, max_new_tokens=10) - expected_chat1 = chat1 + [ - { - "role": "assistant", - "content": "The following is a list of the most popular and", - } - ] - self.assertEqual( - outputs, - [ - {"generated_text": expected_chat1}, - ], - ) - - outputs = text_generator([chat1, chat2], do_sample=False, max_new_tokens=10) - expected_chat2 = chat2 + [ - { - "role": "assistant", - "content": "The following is a list of the most popular and", - } - ] - - self.assertEqual( - outputs, - [ - [{"generated_text": expected_chat1}], - [{"generated_text": expected_chat2}], - ], - ) - - - def get_test_pipeline(self, model, tokenizer, processor): - text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) - return text_generator, ["HuggingFace is in", "This is another test"] - - def test_stop_sequence_stopping_criteria(self): - prompt = """Hello I believe in""" - text_generator = pipeline("text-generation", model="t5-small") - output = text_generator(prompt) - self.assertEqual( - output, - [{"generated_text": - "Hello I believe inHello Hello Hello I believe in believe in in believe in in Hello Hello Hello Hello", - }] - ) - - output = text_generator(prompt, stop_sequence=" in") - self.assertEqual(output, [{"generated_text": "Hello I believe inHello Hello Hello I believe in"}]) - - def run_pipeline_test(self, text_generator, _): - model = text_generator.model - tokenizer = text_generator.tokenizer - - outputs = text_generator("This is a test") - self.assertEqual(outputs, 
[{"generated_text": ANY(str)}]) - self.assertTrue(outputs[0]["generated_text"].startswith("This is a test")) - - outputs = text_generator("This is a test", return_full_text=False) - self.assertEqual(outputs, [{"generated_text": ANY(str)}]) - self.assertNotIn("This is a test", outputs[0]["generated_text"]) - - text_generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, return_full_text=False) - outputs = text_generator("This is a test") - self.assertEqual(outputs, [{"generated_text": ANY(str)}]) - self.assertNotIn("This is a test", outputs[0]["generated_text"]) - - outputs = text_generator("This is a test", return_full_text=True) - self.assertEqual(outputs, [{"generated_text": ANY(str)}]) - self.assertTrue(outputs[0]["generated_text"].startswith("This is a test")) - - outputs = text_generator(["This is great !", "Something else"], num_return_sequences=2, do_sample=True) - self.assertEqual( - outputs, - [ - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - ], - ) - - if text_generator.tokenizer.pad_token is not None: - outputs = text_generator( - ["This is great !", "Something else"], num_return_sequences=2, batch_size=2, do_sample=True - ) - self.assertEqual( - outputs, - [ - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - [{"generated_text": ANY(str)}, {"generated_text": ANY(str)}], - ], - ) - - with self.assertRaises(ValueError): - outputs = text_generator("test", return_full_text=True, return_text=True) - with self.assertRaises(ValueError): - outputs = text_generator("test", return_full_text=True, return_tensors=True) - with self.assertRaises(ValueError): - outputs = text_generator("test", return_text=True, return_tensors=True) - - # Empty prompt is slighly special - # it requires BOS token to exist. - # Special case for Pegasus which will always append EOS so will - # work even without BOS. - if ( - text_generator.tokenizer.bos_token_id is not None - or "Pegasus" in tokenizer.__class__.__name__ - or "Git" in model.__class__.__name__ - ): - outputs = text_generator("") - self.assertEqual(outputs, [{"generated_text": ANY(str)}]) - else: - with self.assertRaises((ValueError, AssertionError)): - outputs = text_generator("") - - if text_generator.framework == "tf": - # TF generation does not support max_new_tokens, and it's impossible - # to control long generation with only max_length without - # fancy calculation, dismissing tests for now. - return - # We don't care about infinite range models. - # They already work. - # Skip this test for XGLM, since it uses sinusoidal positional embeddings which are resized on-the-fly. 
- EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = [ - "RwkvForCausalLM", - "XGLMForCausalLM", - "GPTNeoXForCausalLM", - "FuyuForCausalLM", - ] - if ( - tokenizer.model_max_length < 10000 - and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS - ): - # Handling of large generations - with self.assertRaises((RuntimeError, IndexError, ValueError, AssertionError)): - text_generator("This is a test" * 500, max_new_tokens=20) - - outputs = text_generator("This is a test" * 500, handle_long_generation="hole", max_new_tokens=20) - # Hole strategy cannot work - with self.assertRaises(ValueError): - text_generator( - "This is a test" * 500, - handle_long_generation="hole", - max_new_tokens=tokenizer.model_max_length + 10, - ) - - @require_mindspore - def test_small_model_pt_bloom_accelerate(self): - import mindspore - pipe = pipeline( - model="hf-internal-testing/tiny-random-bloom", - model_kwargs={"ms_dtype": mindspore.float16}, - ) - self.assertEqual(pipe.model.lm_head.weight.dtype, mindspore.float16) - out = pipe("This is a test") - self.assertEqual( - out, - [ - { - "generated_text": ( - "This is a test test test test test test test test test test test test test test test test" - " test" - ) - } - ], - ) - - # Upgraded those two to real pipeline arguments (they just get sent for the model as they're unlikely to mean anything else.) - pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", ms_dtype=mindspore.float16) - self.assertEqual(pipe.model.lm_head.weight.dtype, mindspore.float16) - out = pipe("This is a test") - self.assertEqual( - out, - [ - { - "generated_text": ( - "This is a test test test test test test test test test test test test test test test test" - " test" - ) - } - ], - ) - - pipe = pipeline(model="hf-internal-testing/tiny-random-bloom",ms_dtype=mindspore.float32) - self.assertEqual(pipe.model.lm_head.weight.dtype, mindspore.float32) - out = pipe("This is a test") - self.assertEqual( - out, - [ - { - "generated_text": ( - "This is a test test test test test test test test test test test test test test test test" - " test" - ) - } - ], - ) - - @require_mindspore - def test_small_model_fp16(self): - import mindspore - - pipe = pipeline( - model="hf-internal-testing/tiny-random-bloom", - ms_dtype=mindspore.float16, - ) - pipe("This is a test") - - @require_mindspore - def test_pipeline_accelerate_top_p(self): - import mindspore - - pipe = pipeline( - model="hf-internal-testing/tiny-random-bloom", ms_dtype=mindspore.float16 - ) - pipe("This is a test", do_sample=True, top_p=0.5) - - def test_pipeline_length_setting_warning(self): - prompt = """Hello world""" - text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2") - logger = logging.get_logger("mindnlp.transformers.generation.utils") - logger_msg = "Both `max_new_tokens`" # The beggining of the message to be checked in this test - - # Both are set by the user -> log warning - with CaptureLogger(logger) as cl: - _ = text_generator(prompt, max_length=10, max_new_tokens=1) - self.assertIn(logger_msg, cl.out) - - # The user only sets one -> no warning - with CaptureLogger(logger) as cl: - _ = text_generator(prompt, max_new_tokens=1) - self.assertNotIn(logger_msg, cl.out) - - with CaptureLogger(logger) as cl: - _ = text_generator(prompt, max_length=10) - self.assertNotIn(logger_msg, cl.out) \ No newline at end of file diff --git a/tests/transformers/pipelines/test_pipelines_zero_shot_classification.py 
b/tests/transformers/pipelines/test_pipelines_zero_shot_classification.py deleted file mode 100644 index 79607579f..000000000 --- a/tests/transformers/pipelines/test_pipelines_zero_shot_classification.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# pylint: disable=missing-function-docstring -# pylint: disable=missing-class-docstring -# pylint: disable=no-else-return -# pylint: disable=arguments-renamed -import unittest - -from mindnlp.transformers import ( - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, - Pipeline, - ZeroShotClassificationPipeline, - pipeline, -) -from mindnlp.utils.testing_utils import is_pipeline_test, nested_simplify, require_mindspore, slow - -from .test_pipelines_common import ANY - - -# These 2 model types require different inputs than those of the usual text models. -_TO_SKIP = {"LayoutLMv2Config", "LayoutLMv3Config"} - - -@is_pipeline_test -class ZeroShotClassificationPipelineTests(unittest.TestCase): - model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING - - if model_mapping is not None: - model_mapping = {config: model for config, model in model_mapping.items() if config.__name__ not in _TO_SKIP} - - def get_test_pipeline(self, model, tokenizer, processor): - classifier = ZeroShotClassificationPipeline( - model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"] - ) - return classifier, ["Who are you voting for in 2020?", "My stomach hurts."] - - def run_pipeline_test(self, classifier, _): - outputs = classifier("Who are you voting for in 2020?", candidate_labels="politics") - self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]}) - - # No kwarg - outputs = classifier("Who are you voting for in 2020?", ["politics"]) - self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]}) - - outputs = classifier("Who are you voting for in 2020?", candidate_labels=["politics"]) - self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]}) - - outputs = classifier("Who are you voting for in 2020?", candidate_labels="politics, public health") - self.assertEqual( - outputs, {"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]} - ) - self.assertAlmostEqual(sum(nested_simplify(outputs["scores"])), 1.0) - - outputs = classifier("Who are you voting for in 2020?", candidate_labels=["politics", "public health"]) - self.assertEqual( - outputs, {"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]} - ) - self.assertAlmostEqual(sum(nested_simplify(outputs["scores"])), 1.0) - - outputs = classifier( - "Who are you voting for in 2020?", candidate_labels="politics", hypothesis_template="This text is about {}" - ) - self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]}) - - # 
https://github.com/huggingface/transformers/issues/13846 - outputs = classifier(["I am happy"], ["positive", "negative"]) - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]} - for i in range(1) - ], - ) - outputs = classifier(["I am happy", "I am sad"], ["positive", "negative"]) - self.assertEqual( - outputs, - [ - {"sequence": ANY(str), "labels": [ANY(str), ANY(str)], "scores": [ANY(float), ANY(float)]} - for i in range(2) - ], - ) - - with self.assertRaises(ValueError): - classifier("", candidate_labels="politics") - - with self.assertRaises(TypeError): - classifier(None, candidate_labels="politics") - - with self.assertRaises(ValueError): - classifier("Who are you voting for in 2020?", candidate_labels="") - - with self.assertRaises(TypeError): - classifier("Who are you voting for in 2020?", candidate_labels=None) - - with self.assertRaises(ValueError): - classifier( - "Who are you voting for in 2020?", - candidate_labels="politics", - hypothesis_template="Not formatting template", - ) - - with self.assertRaises(AttributeError): - classifier( - "Who are you voting for in 2020?", - candidate_labels="politics", - hypothesis_template=None, - ) - - self.run_entailment_id(classifier) - - def run_entailment_id(self, zero_shot_classifier: Pipeline): - config = zero_shot_classifier.model.config - original_label2id = config.label2id - original_entailment = zero_shot_classifier.entailment_id - - config.label2id = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2} - self.assertEqual(zero_shot_classifier.entailment_id, -1) - - config.label2id = {"entailment": 0, "neutral": 1, "contradiction": 2} - self.assertEqual(zero_shot_classifier.entailment_id, 0) - - config.label2id = {"ENTAIL": 0, "NON-ENTAIL": 1} - self.assertEqual(zero_shot_classifier.entailment_id, 0) - - config.label2id = {"ENTAIL": 2, "NEUTRAL": 1, "CONTR": 0} - self.assertEqual(zero_shot_classifier.entailment_id, 2) - - zero_shot_classifier.model.config.label2id = original_label2id - self.assertEqual(original_entailment, zero_shot_classifier.entailment_id) - - @require_mindspore - def test_truncation(self): - zero_shot_classifier = pipeline( - "zero-shot-classification", - model="sshleifer/tiny-distilbert-base-cased-distilled-squad", - ) - # There was a regression in 4.10 for this - # Adding a test so we don't make the mistake again. - # https://github.com/huggingface/transformers/issues/13381#issuecomment-912343499 - zero_shot_classifier( - "Who are you voting for in 2020?" 
* 100, candidate_labels=["politics", "public health", "science"] - ) - - @require_mindspore - def test_small_model_ms(self): - zero_shot_classifier = pipeline( - "zero-shot-classification", - model="sshleifer/tiny-distilbert-base-cased-distilled-squad", - ) - outputs = zero_shot_classifier( - "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"] - ) - - self.assertEqual( - nested_simplify(outputs), - { - "sequence": "Who are you voting for in 2020?", - "labels": ["science", "public health", "politics"], - "scores": [0.333, 0.333, 0.333], - }, - ) - - - - @slow - @require_mindspore - def test_large_model_ms(self): - zero_shot_classifier = pipeline( - "zero-shot-classification", model="FacebookAI/roberta-large-mnli", framework="ms" - ) - outputs = zero_shot_classifier( - "Who are you voting for in 2020?", candidate_labels=["politics", "public health", "science"] - ) - - self.assertEqual( - nested_simplify(outputs), - { - "sequence": "Who are you voting for in 2020?", - "labels": ["politics", "public health", "science"], - "scores": [0.976, 0.015, 0.009], - }, - ) - outputs = zero_shot_classifier( - "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks" - " in an encoder-decoder configuration. The best performing models also connect the encoder and decoder" - " through an attention mechanism. We propose a new simple network architecture, the Transformer, based" - " solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two" - " machine translation tasks show these models to be superior in quality while being more parallelizable" - " and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014" - " English-to-German translation task, improving over the existing best results, including ensembles by" - " over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new" - " single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small" - " fraction of the training costs of the best models from the literature. We show that the Transformer" - " generalizes well to other tasks by applying it successfully to English constituency parsing both with" - " large and limited training data.", - candidate_labels=["machine learning", "statistics", "translation", "vision"], - multi_label=True, - ) - self.assertEqual( - nested_simplify(outputs), - { - "sequence": ( - "The dominant sequence transduction models are based on complex recurrent or convolutional neural" - " networks in an encoder-decoder configuration. The best performing models also connect the" - " encoder and decoder through an attention mechanism. We propose a new simple network" - " architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence" - " and convolutions entirely. Experiments on two machine translation tasks show these models to be" - " superior in quality while being more parallelizable and requiring significantly less time to" - " train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task," - " improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014" - " English-to-French translation task, our model establishes a new single-model state-of-the-art" - " BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training" - " costs of the best models from the literature. 
We show that the Transformer generalizes well to" - " other tasks by applying it successfully to English constituency parsing both with large and" - " limited training data." - ), - "labels": ["translation", "machine learning", "vision", "statistics"], - "scores": [0.817, 0.713, 0.018, 0.018], - }, - ) - \ No newline at end of file diff --git a/tests/transformers/test_backbone_common.py b/tests/transformers/test_backbone_common.py deleted file mode 100644 index f0d199b78..000000000 --- a/tests/transformers/test_backbone_common.py +++ /dev/null @@ -1,227 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import inspect -import tempfile - -from mindnlp.utils.testing_utils import require_mindspore -from mindnlp.utils.backbone_utils import BackboneType - - -@require_mindspore -class BackboneTesterMixin: - all_model_classes = () - has_attentions = True - - def test_config(self): - config_class = self.config_class - - # test default config - config = config_class() - self.assertIsNotNone(config) - num_stages = len(config.depths) if hasattr(config, "depths") else config.num_hidden_layers - expected_stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_stages + 1)] - self.assertEqual(config.stage_names, expected_stage_names) - self.assertTrue(set(config.out_features).issubset(set(config.stage_names))) - - # Test out_features and out_indices are correctly set - # out_features and out_indices both None - config = config_class(out_features=None, out_indices=None) - self.assertEqual(config.out_features, [config.stage_names[-1]]) - self.assertEqual(config.out_indices, [len(config.stage_names) - 1]) - - # out_features and out_indices both set - config = config_class(out_features=["stem", "stage1"], out_indices=[0, 1]) - self.assertEqual(config.out_features, ["stem", "stage1"]) - self.assertEqual(config.out_indices, [0, 1]) - - # Only out_features set - config = config_class(out_features=["stage1", "stage3"]) - self.assertEqual(config.out_features, ["stage1", "stage3"]) - self.assertEqual(config.out_indices, [1, 3]) - - # Only out_indices set - config = config_class(out_indices=[0, 2]) - self.assertEqual(config.out_features, [config.stage_names[0], config.stage_names[2]]) - self.assertEqual(config.out_indices, [0, 2]) - - # Error raised when out_indices do not correspond to out_features - with self.assertRaises(ValueError): - config = config_class(out_features=["stage1", "stage2"], out_indices=[0, 2]) - - def test_forward_signature(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - expected_arg_names = ["pixel_values"] - self.assertListEqual(arg_names[:1], expected_arg_names) - - def test_config_save_pretrained(self): - 
config_class = self.config_class - config_first = config_class(out_indices=[0, 1, 2, 3]) - - with tempfile.TemporaryDirectory() as tmpdirname: - config_first.save_pretrained(tmpdirname) - config_second = self.config_class.from_pretrained(tmpdirname) - - self.assertEqual(config_second.to_dict(), config_first.to_dict()) - - def test_channels(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertEqual(len(model.channels), len(config.out_features)) - num_features = model.num_features - out_indices = [config.stage_names.index(feat) for feat in config.out_features] - out_channels = [num_features[idx] for idx in out_indices] - self.assertListEqual(model.channels, out_channels) - - new_config = copy.deepcopy(config) - new_config.out_features = None - model = model_class(new_config) - self.assertEqual(len(model.channels), 1) - self.assertListEqual(model.channels, [num_features[-1]]) - - new_config = copy.deepcopy(config) - new_config.out_indices = None - model = model_class(new_config) - self.assertEqual(len(model.channels), 1) - self.assertListEqual(model.channels, [num_features[-1]]) - - def test_create_from_modified_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - - model.set_train(False) - result = model(**inputs_dict) - - self.assertEqual(len(result.feature_maps), len(config.out_features)) - self.assertEqual(len(model.channels), len(config.out_features)) - self.assertEqual(len(result.feature_maps), len(config.out_indices)) - self.assertEqual(len(model.channels), len(config.out_indices)) - - # Check output of last stage is taken if out_features=None, out_indices=None - modified_config = copy.deepcopy(config) - modified_config.out_features = None - model = model_class(modified_config) - - model.set_train(False) - result = model(**inputs_dict) - - self.assertEqual(len(result.feature_maps), 1) - self.assertEqual(len(model.channels), 1) - - modified_config = copy.deepcopy(config) - modified_config.out_indices = None - model = model_class(modified_config) - - model.set_train(False) - result = model(**inputs_dict) - - self.assertEqual(len(result.feature_maps), 1) - self.assertEqual(len(model.channels), 1) - - # Check backbone can be initialized with fresh weights - modified_config = copy.deepcopy(config) - modified_config.use_pretrained_backbone = False - model = model_class(modified_config) - - model.set_train(False) - result = model(**inputs_dict) - - def test_backbone_common_attributes(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for backbone_class in self.all_model_classes: - backbone = backbone_class(config) - - self.assertTrue(hasattr(backbone, "backbone_type")) - self.assertTrue(hasattr(backbone, "stage_names")) - self.assertTrue(hasattr(backbone, "num_features")) - self.assertTrue(hasattr(backbone, "out_indices")) - self.assertTrue(hasattr(backbone, "out_features")) - self.assertTrue(hasattr(backbone, "out_feature_channels")) - self.assertTrue(hasattr(backbone, "channels")) - - self.assertIsInstance(backbone.backbone_type, BackboneType) - # Verify num_features has been initialized in the backbone init - self.assertIsNotNone(backbone.num_features) - self.assertTrue(len(backbone.channels) == len(backbone.out_indices)) - self.assertTrue(len(backbone.stage_names) == len(backbone.num_features)) - 
self.assertTrue(len(backbone.channels) <= len(backbone.num_features)) - self.assertTrue(len(backbone.out_feature_channels) == len(backbone.stage_names)) - - def test_backbone_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - batch_size = inputs_dict["pixel_values"].shape[0] - - for backbone_class in self.all_model_classes: - backbone = backbone_class(config) - - backbone.set_train(False) - - outputs = backbone(**inputs_dict) - - # Test default outputs and verify feature maps - self.assertIsInstance(outputs.feature_maps, tuple) - self.assertTrue(len(outputs.feature_maps) == len(backbone.channels)) - for feature_map, n_channels in zip(outputs.feature_maps, backbone.channels): - self.assertTrue(feature_map.shape[:2], (batch_size, n_channels)) - self.assertIsNone(outputs.hidden_states) - self.assertIsNone(outputs.attentions) - - # Test output_hidden_states=True - outputs = backbone(**inputs_dict, output_hidden_states=True) - self.assertIsNotNone(outputs.hidden_states) - self.assertTrue(len(outputs.hidden_states), len(backbone.stage_names)) - for hidden_state, n_channels in zip(outputs.hidden_states, backbone.channels): - self.assertTrue(hidden_state.shape[:2], (batch_size, n_channels)) - - # Test output_attentions=True - if self.has_attentions: - outputs = backbone(**inputs_dict, output_attentions=True) - self.assertIsNotNone(outputs.attentions) - - def test_backbone_stage_selection(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - batch_size = inputs_dict["pixel_values"].shape[0] - - for backbone_class in self.all_model_classes: - config.out_indices = [-2, -1] - backbone = backbone_class(config) - - backbone.set_train(False) - - outputs = backbone(**inputs_dict) - - # Test number of feature maps returned - self.assertIsInstance(outputs.feature_maps, tuple) - self.assertTrue(len(outputs.feature_maps) == 2) - - # Order of channels returned is same as order of channels iterating over stage names - channels_from_stage_names = [ - backbone.out_feature_channels[name] for name in backbone.stage_names if name in backbone.out_features - ] - self.assertEqual(backbone.channels, channels_from_stage_names) - for feature_map, n_channels in zip(outputs.feature_maps, backbone.channels): - self.assertTrue(feature_map.shape[:2], (batch_size, n_channels)) diff --git a/tests/transformers/test_configuration_common.py b/tests/transformers/test_configuration_common.py deleted file mode 100644 index 2fc0c2dcd..000000000 --- a/tests/transformers/test_configuration_common.py +++ /dev/null @@ -1,205 +0,0 @@ -# coding=utf-8 -# Copyright 2019 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import json -import os -import tempfile - -from mindnlp.utils import is_mindspore_available - -config_common_kwargs = { - "return_dict": False, - "output_hidden_states": True, - "output_attentions": True, - "ms_dtype": "float16", - # "use_bfloat16": True, - "pruned_heads": {"a": 1}, - "tie_word_embeddings": False, - "is_decoder": True, - "cross_attention_hidden_size": 128, - "add_cross_attention": True, - "tie_encoder_decoder": True, - "max_length": 50, - "min_length": 3, - "do_sample": True, - "early_stopping": True, - "num_beams": 3, - "num_beam_groups": 3, - "diversity_penalty": 0.5, - "temperature": 2.0, - "top_k": 10, - "top_p": 0.7, - "typical_p": 0.2, - "repetition_penalty": 0.8, - "length_penalty": 0.8, - "no_repeat_ngram_size": 5, - "encoder_no_repeat_ngram_size": 5, - "bad_words_ids": [1, 2, 3], - "num_return_sequences": 3, - "chunk_size_feed_forward": 5, - "output_scores": True, - "return_dict_in_generate": True, - "forced_bos_token_id": 2, - "forced_eos_token_id": 3, - "remove_invalid_values": True, - "architectures": ["BertModel"], - "finetuning_task": "translation", - "id2label": {0: "label"}, - "label2id": {"label": "0"}, - "tokenizer_class": "BertTokenizerFast", - "prefix": "prefix", - "bos_token_id": 6, - "pad_token_id": 7, - "eos_token_id": 8, - "sep_token_id": 9, - "decoder_start_token_id": 10, - "exponential_decay_length_penalty": (5, 1.01), - "suppress_tokens": [0, 1], - "begin_suppress_tokens": 2, - "task_specific_params": {"translation": "some_params"}, - "problem_type": "regression", -} - -class ConfigTester(object): - def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): - self.parent = parent - self.config_class = config_class - self.has_text_modality = has_text_modality - self.inputs_dict = kwargs - self.common_properties = common_properties - - def create_and_test_config_common_properties(self): - config = self.config_class(**self.inputs_dict) - common_properties = ( - ["hidden_size", "num_attention_heads", "num_hidden_layers"] - if self.common_properties is None - else self.common_properties - ) - - # Add common fields for text models - if self.has_text_modality: - common_properties.extend(["vocab_size"]) - - # Test that config has the common properties as getters - for prop in common_properties: - self.parent.assertTrue(hasattr(config, prop), msg=f"`{prop}` does not exist") - - # Test that config has the common properties as setter - for idx, name in enumerate(common_properties): - try: - setattr(config, name, idx) - self.parent.assertEqual( - getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}" - ) - except NotImplementedError: - # Some models might not be able to implement setters for common_properties - # In that case, a NotImplementedError is raised - pass - - # Test if config class can be called with Config(prop_name=..) 
- for idx, name in enumerate(common_properties): - try: - config = self.config_class(**{name: idx}) - self.parent.assertEqual( - getattr(config, name), idx, msg=f"`{name} value {idx} expected, but was {getattr(config, name)}" - ) - except NotImplementedError: - # Some models might not be able to implement setters for common_properties - # In that case, a NotImplementedError is raised - pass - - def create_and_test_config_to_json_string(self): - config = self.config_class(**self.inputs_dict) - obj = json.loads(config.to_json_string()) - for key, value in self.inputs_dict.items(): - self.parent.assertEqual(obj[key], value) - - def create_and_test_config_to_json_file(self): - config_first = self.config_class(**self.inputs_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "config.json") - config_first.to_json_file(json_file_path) - config_second = self.config_class.from_json_file(json_file_path) - - self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) - - def create_and_test_config_from_and_save_pretrained(self): - config_first = self.config_class(**self.inputs_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - config_first.save_pretrained(tmpdirname) - config_second = self.config_class.from_pretrained(tmpdirname) - - self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) - - def create_and_test_config_from_and_save_pretrained_subfolder(self): - config_first = self.config_class(**self.inputs_dict) - - subfolder = "test" - with tempfile.TemporaryDirectory() as tmpdirname: - sub_tmpdirname = os.path.join(tmpdirname, subfolder) - config_first.save_pretrained(sub_tmpdirname) - config_second = self.config_class.from_pretrained(tmpdirname, subfolder=subfolder) - - self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) - - def create_and_test_config_with_num_labels(self): - config = self.config_class(**self.inputs_dict, num_labels=5) - self.parent.assertEqual(len(config.id2label), 5) - self.parent.assertEqual(len(config.label2id), 5) - - config.num_labels = 3 - self.parent.assertEqual(len(config.id2label), 3) - self.parent.assertEqual(len(config.label2id), 3) - - def check_config_can_be_init_without_params(self): - if self.config_class.is_composition: - with self.parent.assertRaises(ValueError): - config = self.config_class() - else: - config = self.config_class() - self.parent.assertIsNotNone(config) - - def check_config_arguments_init(self): - kwargs = copy.deepcopy(config_common_kwargs) - config = self.config_class(**kwargs) - wrong_values = [] - for key, value in config_common_kwargs.items(): - if key == "ms_dtype": - if not is_mindspore_available(): - continue - else: - import mindspore - - if config.ms_dtype != mindspore.float16: - wrong_values.append(("ms_dtype", config.ms_dtype, str(mindspore.float16).lower())) - elif getattr(config, key) != value: - wrong_values.append((key, getattr(config, key), value)) - - if len(wrong_values) > 0: - errors = "\n".join([f"- {v[0]}: got {v[1]} instead of {v[2]}" for v in wrong_values]) - raise ValueError(f"The following keys were not properly set in the config:\n{errors}") - - def run_common_tests(self): - self.create_and_test_config_common_properties() - self.create_and_test_config_to_json_string() - self.create_and_test_config_to_json_file() - self.create_and_test_config_from_and_save_pretrained() - self.create_and_test_config_from_and_save_pretrained_subfolder() - self.create_and_test_config_with_num_labels() - 
self.check_config_can_be_init_without_params() - self.check_config_arguments_init() \ No newline at end of file diff --git a/tests/transformers/test_feature_extraction_common.py b/tests/transformers/test_feature_extraction_common.py deleted file mode 100644 index c9b707cca..000000000 --- a/tests/transformers/test_feature_extraction_common.py +++ /dev/null @@ -1,55 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os -import tempfile - -from mindnlp.utils.testing_utils import check_json_file_has_correct_format - - -class FeatureExtractionSavingTestMixin: - test_cast_dtype = None - - def test_feat_extract_to_json_string(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - obj = json.loads(feat_extract.to_json_string()) - for key, value in self.feat_extract_dict.items(): - self.assertEqual(obj[key], value) - - def test_feat_extract_to_json_file(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "feat_extract.json") - feat_extract_first.to_json_file(json_file_path) - feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) - - self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) - - def test_feat_extract_from_and_save_pretrained(self): - feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = feat_extract_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) - - self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) - - def test_init_without_params(self): - feat_extract = self.feature_extraction_class() - self.assertIsNotNone(feat_extract) diff --git a/tests/transformers/test_image_processing_common.py b/tests/transformers/test_image_processing_common.py deleted file mode 100644 index 94ce11851..000000000 --- a/tests/transformers/test_image_processing_common.py +++ /dev/null @@ -1,369 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -import json -import os -import pathlib -import tempfile - -from mindnlp.transformers.feature_extraction_utils import BatchFeature -from mindnlp.transformers.image_utils import AnnotationFormat, AnnotionFormat -from mindnlp.utils.testing_utils import check_json_file_has_correct_format, require_mindspore, require_vision, get_tests_dir -from mindnlp.utils import is_mindspore_available, is_vision_available - - -if is_mindspore_available(): - import numpy as np - import mindspore as ms - -if is_vision_available(): - from PIL import Image - - -def prepare_image_inputs( - batch_size, - min_resolution, - max_resolution, - num_channels, - size_divisor=None, - equal_resolution=False, - numpify=False, - torchify=False, -): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. - - One can specify whether the images are of the same resolution or not. - """ - - assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - - image_inputs = [] - for i in range(batch_size): - if equal_resolution: - width = height = max_resolution - else: - # To avoid getting image width/height 0 - if size_divisor is not None: - # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` - min_resolution = max(size_divisor, min_resolution) - width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) - image_inputs.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) - - if not numpify and not torchify: - # PIL expects the channel dimension as last dimension - image_inputs = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in image_inputs] - - if torchify: - image_inputs = [ms.Tensor(image) for image in image_inputs] - - return image_inputs - - -def prepare_video(num_frames, num_channels, width=10, height=10, numpify=False, torchify=False): - """This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors.""" - - video = [] - for i in range(num_frames): - video.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) - - if not numpify and not torchify: - # PIL expects the channel dimension as last dimension - video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video] - - if torchify: - video = [ms.Tensor(frame) for frame in video] - - return video - - -def prepare_video_inputs( - batch_size, - num_frames, - num_channels, - min_resolution, - max_resolution, - equal_resolution=False, - numpify=False, - torchify=False, -): - """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if - one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True. - - One can specify whether the videos are of the same resolution or not. 
- """ - - assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - - video_inputs = [] - for i in range(batch_size): - if equal_resolution: - width = height = max_resolution - else: - width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) - video = prepare_video( - num_frames=num_frames, - num_channels=num_channels, - width=width, - height=height, - numpify=numpify, - torchify=torchify, - ) - video_inputs.append(video) - - return video_inputs - - -class ImageProcessingTestMixin: - test_cast_dtype = None - - def test_image_processor_to_json_string(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - obj = json.loads(image_processor.to_json_string()) - for key, value in self.image_processor_dict.items(): - self.assertEqual(obj[key], value) - - def test_image_processor_to_json_file(self): - image_processor_first = self.image_processing_class(**self.image_processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - json_file_path = os.path.join(tmpdirname, "image_processor.json") - image_processor_first.to_json_file(json_file_path) - image_processor_second = self.image_processing_class.from_json_file(json_file_path) - - self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) - - def test_image_processor_from_and_save_pretrained(self): - image_processor_first = self.image_processing_class(**self.image_processor_dict) - - with tempfile.TemporaryDirectory() as tmpdirname: - saved_file = image_processor_first.save_pretrained(tmpdirname)[0] - check_json_file_has_correct_format(saved_file) - image_processor_second = self.image_processing_class.from_pretrained(tmpdirname) - - self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) - - def test_init_without_params(self): - image_processor = self.image_processing_class() - self.assertIsNotNone(image_processor) - - @require_mindspore - @require_vision - def test_cast_dtype_device(self): - if self.test_cast_dtype is not None: - # Initialize image_processor - image_processor = self.image_processing_class(**self.image_processor_dict) - - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - - encoding = image_processor(image_inputs, return_tensors="ms") - # for layoutLM compatibility - self.assertEqual(encoding.pixel_values.dtype, ms.float32) - - encoding = image_processor(image_inputs, return_tensors="ms").to(ms.float16) - self.assertEqual(encoding.pixel_values.dtype, ms.float16) - - # Try with text + image feature - encoding = image_processor(image_inputs, return_tensors="ms") - encoding.update({"input_ids": ms.Tensor([[1, 2, 3], [4, 5, 6]])}) - encoding = encoding.to(ms.float16) - - self.assertEqual(encoding.pixel_values.dtype, ms.float16) - self.assertEqual(encoding.input_ids.dtype, ms.int64) - - def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for image in image_inputs: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1,
*expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) - - def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for image in image_inputs: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) - - def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - - for image in image_inputs: - self.assertIsInstance(image, ms.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="ms").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - encoded_images = image_processing(image_inputs, return_tensors="ms").pixel_values - self.assertEqual( - tuple(encoded_images.shape), - (self.image_processor_tester.batch_size, *expected_output_image_shape), - ) - - def test_call_numpy_4_channels(self): - # Test that can process images which have an arbitrary number of channels - # Initialize image_processing - image_processor = self.image_processing_class(**self.image_processor_dict) - - # create random numpy tensors - self.image_processor_tester.num_channels = 4 - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - - # Test not batched input - encoded_images = image_processor( - image_inputs[0], - return_tensors="ms", - input_data_format="channels_first", - image_mean=0, - image_std=1, - ).pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processor( - image_inputs, - return_tensors="ms", - input_data_format="channels_first", - image_mean=0, - image_std=1, - ).pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) - - def 
test_image_processor_preprocess_arguments(self): - image_processor = self.image_processing_class(**self.image_processor_dict) - if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"): - preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args - preprocess_parameter_names.remove("self") - preprocess_parameter_names.sort() - valid_processor_keys = image_processor._valid_processor_keys - valid_processor_keys.sort() - self.assertEqual(preprocess_parameter_names, valid_processor_keys) - - -class AnnotationFormatTestMixin: - # this mixin adds a test to assert that usages of the - # to-be-deprecated `AnnotionFormat` continue to be - # supported for the time being - - def test_processor_can_use_legacy_annotation_format(self): - image_processor_dict = self.image_processor_tester.prepare_image_processor_dict() - fixtures_path = pathlib.Path(get_tests_dir()) / "fixtures" / "tests_samples" / "COCO" - - with open(fixtures_path / "coco_annotations.txt", "r") as f: - detection_target = json.loads(f.read()) - - detection_annotations = {"image_id": 39769, "annotations": detection_target} - - detection_params = { - "images": Image.open(fixtures_path / "000000039769.png"), - "annotations": detection_annotations, - "return_tensors": "ms", - } - - with open(fixtures_path / "coco_panoptic_annotations.txt", "r") as f: - panoptic_target = json.loads(f.read()) - - panoptic_annotations = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": panoptic_target} - - masks_path = pathlib.Path(fixtures_path / "coco_panoptic") - - panoptic_params = { - "images": Image.open(fixtures_path / "000000039769.png"), - "annotations": panoptic_annotations, - "return_tensors": "ms", - "masks_path": masks_path, - } - - test_cases = [ - ("coco_detection", detection_params), - ("coco_panoptic", panoptic_params), - (AnnotionFormat.COCO_DETECTION, detection_params), - (AnnotionFormat.COCO_PANOPTIC, panoptic_params), - (AnnotationFormat.COCO_DETECTION, detection_params), - (AnnotationFormat.COCO_PANOPTIC, panoptic_params), - ] - - def _compare(a, b) -> None: - if isinstance(a, (dict, BatchFeature)): - self.assertEqual(a.keys(), b.keys()) - for k, v in a.items(): - _compare(v, b[k]) - elif isinstance(a, list): - self.assertEqual(len(a), len(b)) - for idx in range(len(a)): - _compare(a[idx], b[idx]) - elif isinstance(a, ms.Tensor): - self.assertTrue(np.allclose(a.asnumpy(), b.asnumpy(), atol=1e-3)) - elif isinstance(a, str): - self.assertEqual(a, b) - - for annotation_format, params in test_cases: - with self.subTest(annotation_format): - image_processor_params = {**image_processor_dict, **{"format": annotation_format}} - image_processor_first = self.image_processing_class(**image_processor_params) - - with tempfile.TemporaryDirectory() as tmpdirname: - image_processor_first.save_pretrained(tmpdirname) - image_processor_second = self.image_processing_class.from_pretrained(tmpdirname) - - # check the 'format' key exists and that the dicts of the - # first and second processors are equal - self.assertIn("format", image_processor_first.to_dict().keys()) - self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict()) - - # perform encoding using both processors and compare - # the resulting BatchFeatures - first_encoding = image_processor_first(**params) - second_encoding = image_processor_second(**params) - _compare(first_encoding, second_encoding) diff --git a/tests/transformers/test_modeling_common.py 
b/tests/transformers/test_modeling_common.py deleted file mode 100644 index 504e41d1e..000000000 --- a/tests/transformers/test_modeling_common.py +++ /dev/null @@ -1,3268 +0,0 @@ -# coding=utf-8 -# Copyright 2019 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import copy -import gc -import inspect -import math -import os -import os.path -import random -import re -import tempfile -import time -import warnings -from collections import defaultdict -from typing import Dict, List, Tuple -import unittest - -import numpy as np -from packaging import version -from parameterized import parameterized -from pytest import mark - -from mindnlp.transformers import ( - AutoModel, - AutoModelForCausalLM, - AutoModelForSequenceClassification, - AutoTokenizer, - GenerationConfig, - PretrainedConfig, - PreTrainedModel, - logging, -) -from mindnlp.engine import set_seed -from mindnlp.core import no_grad, optim, value_and_grad -from mindnlp.core.serialization import save_checkpoint, load_checkpoint -from mindnlp.transformers.models.auto import get_values -from mindnlp.transformers.models.auto.modeling_auto import ( - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, - MODEL_FOR_BACKBONE_MAPPING_NAMES, - MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES, - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES, - MODEL_FOR_MASKED_LM_MAPPING_NAMES, - MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES, - MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES, - MODEL_FOR_PRETRAINING_MAPPING_NAMES, - MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, - MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, - MODEL_MAPPING_NAMES, -) -from mindnlp.utils.testing_utils import ( - is_mindspore_available, - CaptureLogger, - is_flaky, - # require_accelerate, - # require_bitsandbytes, - # require_flash_attn, - # require_read_token, - require_safetensors, - require_mindspore, - slow, -) -from mindnlp.transformers.utils import CONFIG_NAME, GENERATION_CONFIG_NAME, SAFE_WEIGHTS_NAME -from mindnlp.core.configs import ON_ORANGE_PI -from mindnlp.utils.generic import ContextManagers, ModelOutput -from transformers.pytorch_utils import id_tensor_storage - -# if is_accelerate_available(): -# from accelerate.utils import compute_module_sizes - - -if is_mindspore_available(): - import mindspore - from mindnlp.core import nn, ops - import mindnlp.core.nn.functional as F - from mindnlp.core.serialization import safe_save_file, safe_load_file - - from mindnlp.transformers import MODEL_MAPPING#, AdaptiveEmbedding - from mindnlp.transformers.modeling_utils import load_state_dict, no_init_weights - - -def _config_zero_init(config): - 
configs_no_init = copy.deepcopy(config) - for key in configs_no_init.__dict__.keys(): - if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: - setattr(configs_no_init, key, 1e-10) - if isinstance(getattr(configs_no_init, key, None), PretrainedConfig): - no_init_subconfig = _config_zero_init(getattr(configs_no_init, key)) - setattr(configs_no_init, key, no_init_subconfig) - return configs_no_init - - -def _mock_init_weights(self, module): - for name, param in module.named_parameters(recurse=False): - # Use the first letter of the name to get a value and go from a <> -13 to z <> 12 - value = ord(name[0].lower()) - 110 - param.assign_value(ops.full(param.shape, value, dtype=param.dtype)) - - -def _mock_all_init_weights(self): - # Prune heads if needed - if self.config.pruned_heads: - self.prune_heads(self.config.pruned_heads) - - import mindnlp.transformers.modeling_utils - - if mindnlp.transformers.modeling_utils._init_weights: - for module in self.modules(): - module._is_initialized = False - # Initialize weights - self.apply(self._initialize_weights) - - # Tie weights should be skipped when not initializing all weights - # since from_pretrained(...) calls tie weights anyways - self.tie_weights() - - -@require_mindspore -class ModelTesterMixin: - model_tester = None - all_model_classes = () - all_generative_model_classes = () - fx_compatible = False - test_pruning = True - test_resize_embeddings = True - test_resize_position_embeddings = False - test_head_masking = True - test_mismatched_shapes = True - test_missing_keys = True - test_model_parallel = False - is_encoder_decoder = False - has_attentions = True - model_split_percents = [0.5, 0.7, 0.9] - - def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): - inputs_dict = copy.deepcopy(inputs_dict) - if model_class.__name__ in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES): - inputs_dict = { - k: v.unsqueeze(1).broadcast_to((-1, self.model_tester.num_choices, -1)) - if isinstance(v, mindspore.Tensor) and v.ndim > 1 - else v - for k, v in inputs_dict.items() - } - elif model_class.__name__ in get_values(MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES): - inputs_dict.pop("attention_mask") - elif model_class.__name__ == MODEL_FOR_PRETRAINING_MAPPING_NAMES["hiera"]: - config = self.model_tester.get_config() - mask_spatial_shape = [ - i // s // ms for i, s, ms in zip(config.image_size, config.patch_stride, config.masked_unit_size) - ] - num_windows = math.prod(mask_spatial_shape) - set_seed(123) - inputs_dict["noise"] = ops.rand(self.model_tester.batch_size, num_windows) - - if return_labels: - if model_class.__name__ in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES): - inputs_dict["labels"] = ops.ones(self.model_tester.batch_size, dtype=mindspore.int64) - elif model_class.__name__ in [ - *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES), - *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES), - ]: - inputs_dict["start_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - inputs_dict["end_positions"] = ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class.__name__ in [ - *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES), - *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES), - ]: - inputs_dict["labels"] 
= ops.zeros( - self.model_tester.batch_size, dtype=mindspore.int64 - ) - elif model_class.__name__ in [ - *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES), - *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - ]: - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=mindspore.int64 - ) - elif model_class.__name__ in get_values(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES): - num_patches = self.model_tester.image_size // self.model_tester.patch_size - inputs_dict["bool_masked_pos"] = ops.zeros( - (self.model_tester.batch_size, num_patches**2), dtype=mindspore.int64 - ) - elif model_class.__name__ in get_values(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES): - batch_size, num_channels, height, width = inputs_dict["pixel_values"].shape - inputs_dict["labels"] = ops.zeros( - (self.model_tester.batch_size, height, width) - ).long() - - return inputs_dict - - def test_save_load(self): - set_seed(123) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_save_load(out1, out2): - # make sure we don't have nans - out_2 = out2.asnumpy() - out_2[np.isnan(out_2)] = 0 - out_2 = out_2[~np.isneginf(out_2)] - - out_1 = out1.asnumpy() - out_1[np.isnan(out_1)] = 0 - out_1 = out_1[~np.isneginf(out_1)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - with no_grad(): - first = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - # the config file (and the generation config file, if it can generate) should be saved - self.assertTrue(os.path.exists(os.path.join(tmpdirname, CONFIG_NAME))) - self.assertEqual( - model.can_generate(), os.path.exists(os.path.join(tmpdirname, GENERATION_CONFIG_NAME)) - ) - - model = model_class.from_pretrained(tmpdirname) - with no_grad(): - second = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - if isinstance(first, tuple) and isinstance(second, tuple): - for tensor1, tensor2 in zip(first, second): - check_save_load(tensor1, tensor2) - else: - check_save_load(first, second) - - def test_from_pretrained_no_checkpoint(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - state_dict = model.state_dict() - - new_model = model_class.from_pretrained( - pretrained_model_name_or_path=None, config=config, state_dict=state_dict - ) - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(ops.equal(p1, p2).all()) - - def test_keep_in_fp32_modules(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - if model_class._keep_in_fp32_modules is None: - self.skipTest(reason="Model class has no _keep_in_fp32_modules attribute defined") - - model = model_class(config) - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained(tmpdirname, ms_dtype=mindspore.float16) - - for name, param in model.named_parameters(): - if any(n in model_class._keep_in_fp32_modules for n in 
name.split(".")): - self.assertTrue(param.dtype == mindspore.float32) - else: - self.assertTrue(param.dtype == mindspore.float16, name) - - def test_save_load_keys_to_ignore_on_save(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - _keys_to_ignore_on_save = getattr(model, "_keys_to_ignore_on_save", None) - if _keys_to_ignore_on_save is None: - continue - - # check the keys are in the original state_dict - for k in _keys_to_ignore_on_save: - self.assertIn(k, model.state_dict().keys(), "\n".join(model.state_dict().keys())) - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - output_model_file = os.path.join(tmpdirname, SAFE_WEIGHTS_NAME) - state_dict_saved = safe_load_file(output_model_file) - - for k in _keys_to_ignore_on_save: - self.assertNotIn(k, state_dict_saved.keys(), "\n".join(state_dict_saved.keys())) - - # Test we can load the state dict in the model, necessary for the checkpointing API in Trainer. - load_result = model.load_state_dict(state_dict_saved, strict=False) - keys_to_ignore = set(model._keys_to_ignore_on_save) - - if hasattr(model, "_tied_weights_keys"): - keys_to_ignore.update(set(model._tied_weights_keys)) - - self.assertTrue(len(load_result.missing_keys) == 0 or set(load_result.missing_keys) == keys_to_ignore) - self.assertTrue(len(load_result.unexpected_keys) == 0) - - def test_gradient_checkpointing_backward_compatibility(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - - config.gradient_checkpointing = True - model = model_class(config) - self.assertTrue(model.is_gradient_checkpointing) - - def test_gradient_checkpointing_enable_disable(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class.supports_gradient_checkpointing: - continue - - # at init model should have gradient checkpointing disabled - model = model_class(config) - self.assertFalse(model.is_gradient_checkpointing) - - # check enable works - model.gradient_checkpointing_enable() - self.assertTrue(model.is_gradient_checkpointing) - - # Loop over all modules and check that relevant modules have gradient_checkpointing set to True - for n, m in model.named_modules(): - if hasattr(m, "gradient_checkpointing"): - self.assertTrue( - m.gradient_checkpointing, f"Module {n} does not have gradient_checkpointing set to True" - ) - - # check disable works - model.gradient_checkpointing_disable() - self.assertFalse(model.is_gradient_checkpointing) - - # Loop over all modules and check that relevant modules have gradient_checkpointing set to False - for n, m in model.named_modules(): - if hasattr(m, "gradient_checkpointing"): - self.assertFalse( - m.gradient_checkpointing, f"Module {n} does not have gradient_checkpointing set to False" - ) - - @is_flaky(description="low likelihood of failure, reason not yet discovered") - def test_save_load_fast_init_from_base(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if config.__class__ not in MODEL_MAPPING: - self.skipTest(reason="Model class not in MODEL_MAPPING") - - base_class = MODEL_MAPPING[config.__class__] - - if isinstance(base_class, tuple): - base_class = 
base_class[0] - - for model_class in self.all_model_classes: - if model_class == base_class: - continue - - # make a copy of model class to not break future tests - # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class - class CopyClass(model_class): - pass - - model_class_copy = CopyClass - - # make sure that all keys are expected for test - model_class_copy._keys_to_ignore_on_load_missing = [] - - # make init deterministic, but make sure that - # non-initialized weights throw errors nevertheless - model_class_copy._init_weights = _mock_init_weights - model_class_copy.init_weights = _mock_all_init_weights - - model = base_class(config) - state_dict = model.state_dict() - - # this will often delete a single weight of a multi-weight module - # to test an edge case - random_key_to_del = random.choice(list(state_dict.keys())) - del state_dict[random_key_to_del] - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - save_checkpoint(state_dict, os.path.join(tmpdirname, "mindspore_model.ckpt")) - - model_fast_init = model_class_copy.from_pretrained(tmpdirname) - model_slow_init = model_class_copy.from_pretrained(tmpdirname, _fast_init=False) - # Before we test anything - - for key in model_fast_init.state_dict().keys(): - # if isinstance(model_slow_init.state_dict()[key], mindspore.Tensor): - # max_diff = (model_slow_init.state_dict()[key] ^ model_fast_init.state_dict()[key]).sum().item() - # else: - max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item() - self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - - # @slow - # @require_accelerate - # @mark.accelerate_tests - # def test_save_load_low_cpu_mem_usage(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # with tempfile.TemporaryDirectory() as saved_model_path: - # for model_class in self.all_model_classes: - # model_to_save = model_class(config) - # model_to_save.save_pretrained(saved_model_path) - - # self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) - - # @slow - # @require_accelerate - # @mark.accelerate_tests - # def test_save_load_low_cpu_mem_usage_checkpoints(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # with tempfile.TemporaryDirectory() as saved_model_path: - # for model_class in self.all_model_classes: - # model_to_save = model_class(config) - # model_to_save.config.save_pretrained(saved_model_path) - # torch.save(model_to_save.state_dict(), os.path.join(saved_model_path, "mindspore_model.ckpt")) - - # self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) - - # @slow - # @require_accelerate - # @mark.accelerate_tests - # def test_save_load_low_cpu_mem_usage_no_safetensors(self): - # with tempfile.TemporaryDirectory() as saved_model_path: - # for model_class in self.all_model_classes: - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # model_to_save = model_class(config) - - # model_to_save.save_pretrained(saved_model_path, safe_serialization=False) - # self._check_save_load_low_cpu_mem_usage(model_class, saved_model_path) - - # def _check_save_load_low_cpu_mem_usage(self, model_class, saved_model_path): - # from accelerate.utils.modeling import named_module_tensors - - # # Load the low usage and the normal models. 
- # model_low_usage, loading_info = model_class.from_pretrained( - # saved_model_path, - # low_cpu_mem_usage=True, - # output_loading_info=True, - # ) - # model_non_low_usage = model_class.from_pretrained(saved_model_path) - - # # Check that there were no missing keys. - # self.assertEqual(loading_info["missing_keys"], []) - - # # The low_cpu_mem_usage=True causes the model params to be initialized with device=meta, and then - # # subsequently loaded with the correct values and onto the correct device. We check if there are any - # # remaining params that were not properly loaded. - # for name, tensor in named_module_tensors(model_low_usage, recurse=True): - # self.assertNotEqual( - # tensor.device, - # torch.device("meta"), - # "Tensor '" + name + "' has not been properly loaded and has device=meta.", - # ) - - # # Check that the parameters are equal. - # for p1, p2 in zip(model_low_usage.parameters(), model_non_low_usage.parameters()): - # self.assertEqual(p1.ne(p2).sum(), 0) - - # # Check that the state dict keys are equal. - # self.assertEqual(set(model_low_usage.state_dict().keys()), set(model_non_low_usage.state_dict().keys())) - - # # Check that the shared tensors are equal. - # tensor_ptrs1 = collections.defaultdict(list) - # for name, tensor in model_low_usage.state_dict().items(): - # tensor_ptrs1[id(tensor)].append(name) - # tied_params1 = [names for _, names in tensor_ptrs1.items() if len(names) > 1] - - # tensor_ptrs2 = collections.defaultdict(list) - # for name, tensor in model_non_low_usage.state_dict().items(): - # tensor_ptrs2[id(tensor)].append(name) - # tied_params2 = [names for _, names in tensor_ptrs2.items() if len(names) > 1] - - # self.assertEqual(tied_params1, tied_params2) - - def test_save_load_fast_init_to_base(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if config.__class__ not in MODEL_MAPPING: - self.skipTest(reason="Model class not in MODEL_MAPPING") - - base_class = MODEL_MAPPING[config.__class__] - - if isinstance(base_class, tuple): - base_class = base_class[0] - - for model_class in self.all_model_classes: - if model_class == base_class: - continue - - # make a copy of model class to not break future tests - # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class - class CopyClass(base_class): - pass - - base_class_copy = CopyClass - - # make sure that all keys are expected for test - base_class_copy._keys_to_ignore_on_load_missing = [] - - # make init deterministic, but make sure that - # non-initialized weights throw errors nevertheless - base_class_copy._init_weights = _mock_init_weights - base_class_copy.init_weights = _mock_all_init_weights - - model = model_class(config) - state_dict = model.state_dict() - - # this will often delete a single weight of a multi-weight module - # to test an edge case - random_key_to_del = random.choice(list(state_dict.keys())) - del state_dict[random_key_to_del] - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - model.config.save_pretrained(tmpdirname) - save_checkpoint(state_dict, os.path.join(tmpdirname, "mindspore_model.ckpt")) - - model_fast_init = base_class_copy.from_pretrained(tmpdirname) - model_slow_init = base_class_copy.from_pretrained(tmpdirname, _fast_init=False) - - for key in model_fast_init.state_dict().keys(): - # if isinstance(model_slow_init.state_dict()[key], mindspore.Tensor): - # max_diff = ops.max( - # model_slow_init.state_dict()[key] ^ 
model_fast_init.state_dict()[key] - # ).item() - # else: - max_diff = ops.max( - ops.abs(model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]) - ).item() - self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical") - - def test_mindspore_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - if config.__class__ not in MODEL_MAPPING: - self.skipTest(reason="Model class not in MODEL_MAPPING") - - base_class = MODEL_MAPPING[config.__class__] - - if isinstance(base_class, tuple): - base_class = base_class[0] - - for model_class in self.all_model_classes: - if model_class == base_class: - continue - - # make a copy of model class to not break future tests - # from https://stackoverflow.com/questions/9541025/how-to-copy-a-python-class - class CopyClass(base_class): - pass - - base_class_copy = CopyClass - - # make sure that all keys are expected for test - base_class_copy._keys_to_ignore_on_load_missing = [] - - # make init deterministic, but make sure that - # non-initialized weights throw errors nevertheless - base_class_copy._init_weights = _mock_init_weights - base_class_copy.init_weights = _mock_all_init_weights - - model = model_class(config) - state_dict = model.state_dict() - - def check_equal(loaded): - for key in state_dict.keys(): - if state_dict[key].dtype == mindspore.bool_: - continue - max_diff = ops.max(ops.abs(state_dict[key] - loaded[key]) - ).item() - self.assertLessEqual(max_diff, 1e-6, msg=f"{key} not identical") - - # check that certain keys didn't get saved with the model - with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "mindspore_model.ckpt") - save_checkpoint(state_dict, pt_checkpoint_path) - check_equal(load_state_dict(pt_checkpoint_path)) - save_checkpoint(state_dict, pt_checkpoint_path) - check_equal(load_state_dict(pt_checkpoint_path)) - - def test_initialization(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - for name, param in model.named_parameters(): - if param.requires_grad: - self.assertIn( - ((param.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_determinism(first, second): - out_1 = first.asnumpy() - out_2 = second.asnumpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - out_1 = out_1[~np.isneginf(out_1)] - out_2 = out_2[~np.isneginf(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - with no_grad(): - first = model(**self._prepare_for_class(inputs_dict, model_class))[0] - second = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - if isinstance(first, tuple) and isinstance(second, tuple): - for tensor1, tensor2 in zip(first, second): - check_determinism(tensor1, tensor2) - else: - check_determinism(first, second) - - def test_batching_equivalence(self): - """ - Tests that the model supports batching and that the output is the nearly the same for the same input in - different batch sizes. - (Why "nearly the same" not "exactly the same"? 
Batching uses different matmul shapes, which often leads to - different results: https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535) - """ - - def get_tensor_equivalence_function(batched_input): - # models operating on continuous spaces have higher abs difference than LMs - # instead, we can rely on cos distance for image/speech models, similar to `diffusers` - if "input_ids" not in batched_input: - return lambda tensor1, tensor2: ( - 1.0 - F.cosine_similarity(tensor1.float().flatten(), tensor2.float().flatten(), dim=0, eps=1e-38) - ) - return lambda tensor1, tensor2: ops.max(ops.abs(tensor1 - tensor2)) - - def recursive_check(batched_object, single_row_object, model_name, key): - if isinstance(batched_object, (list, tuple)): - for batched_object_value, single_row_object_value in zip(batched_object, single_row_object): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - elif isinstance(batched_object, dict): - for batched_object_value, single_row_object_value in zip( - batched_object.values(), single_row_object.values() - ): - recursive_check(batched_object_value, single_row_object_value, model_name, key) - # do not compare returned loss (0-dim tensor) / codebook ids (int) / caching objects - elif batched_object is None or not isinstance(batched_object, mindspore.Tensor): - return - elif batched_object.ndim == 0: - return - else: - if isinstance(batched_object.dtype, mindspore.dtype.Int): - return - # indexing the first element does not always work - # e.g. models that output similarity scores of size (N, M) would need to index [0, 0] - slice_ids = [slice(0, index) for index in single_row_object.shape] - batched_row = batched_object[slice_ids] - self.assertFalse( - ops.isnan(batched_row).any(), f"Batched output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(batched_row).any(), f"Batched output has `inf` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isnan(single_row_object).any(), f"Single row output has `nan` in {model_name} for key={key}" - ) - self.assertFalse( - ops.isinf(single_row_object).any(), f"Single row output has `inf` in {model_name} for key={key}" - ) - self.assertTrue( - (equivalence(batched_row, single_row_object)) <= 1e-03, - msg=( - f"Batched and Single row outputs are not equal in {model_name} for key={key}. " - f"Difference={equivalence(batched_row, single_row_object)}." - ), - ) - - config, batched_input = self.model_tester.prepare_config_and_inputs_for_common() - equivalence = get_tensor_equivalence_function(batched_input) - - for model_class in self.all_model_classes: - config.output_hidden_states = True - - model_name = model_class.__name__ - if hasattr(self.model_tester, "prepare_config_and_inputs_for_model_class"): - config, batched_input = self.model_tester.prepare_config_and_inputs_for_model_class(model_class) - - batched_input_prepared = self._prepare_for_class(batched_input, model_class) - model = model_class(config).eval() - - batch_size = self.model_tester.batch_size - single_row_input = {} - for key, value in batched_input_prepared.items(): - if isinstance(value, mindspore.Tensor) and value.shape[0] % batch_size == 0: - # e.g. musicgen has inputs of size (bs*codebooks). 
in most cases value.shape[0] == batch_size - single_batch_shape = value.shape[0] // batch_size - single_row_input[key] = value[:single_batch_shape] - else: - single_row_input[key] = value - - with no_grad(): - model_batched_output = model(**batched_input_prepared) - model_row_output = model(**single_row_input) - - if isinstance(model_batched_output, mindspore.Tensor): - model_batched_output = {"model_output": model_batched_output} - model_row_output = {"model_output": model_row_output} - - for key in model_batched_output: - # DETR starts from zero-init queries to decoder, leading to cos_similarity = `nan` - if hasattr(self, "zero_init_hidden_state") and "decoder_hidden_states" in key: - model_batched_output[key] = model_batched_output[key][1:] - model_row_output[key] = model_row_output[key][1:] - - recursive_check(model_batched_output[key], model_row_output[key], model_name, key) - - def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): - if not self.model_tester.is_training: - self.skipTest(reason="ModelTester is not configured to run training tests") - - for model_class in self.all_model_classes: - if ( - model_class.__name__ - in [ - *get_values(MODEL_MAPPING_NAMES), - *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES), - ] - or not model_class.supports_gradient_checkpointing - ): - continue - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - model = model_class(config) - - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - model.train() - - # unfreeze additional layers - for p in model.parameters(): - p.requires_grad = True - - optimizer = optim.SGD(model.parameters(), lr=0.01) - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - def forward(**inputs): - loss = model(**inputs).loss - return loss - grad_fn = value_and_grad(forward, tuple(model.parameters()), attach_grads=True) - loss = grad_fn(**inputs) - optimizer.step() - - def test_training(self): - if not self.model_tester.is_training: - self.skipTest(reason="ModelTester is not configured to run training tests") - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - if model_class.__name__ in [ - *get_values(MODEL_MAPPING_NAMES), - *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES), - ]: - continue - - model = model_class(config) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - def forward(**inputs): - return model(**inputs).loss - - grad_fn = value_and_grad(forward, tuple(model.parameters())) - loss = grad_fn(**inputs) - - @unittest.skip - def test_training_gradient_checkpointing(self): - # Scenario - 1 default behaviour - self.check_training_gradient_checkpointing() - - @unittest.skip - def test_training_gradient_checkpointing_use_reentrant(self): - # Scenario - 2 with `use_reentrant=True` - this is the default value that is used in pytorch's - # torch.utils.checkpoint.checkpoint - self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True}) - - @unittest.skip - def test_training_gradient_checkpointing_use_reentrant_false(self): - # Scenario - 3 with `use_reentrant=False` pytorch suggests users to use this value for - # future releases: https://pytorch.org/docs/stable/checkpoint.html - 
self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": False}) - - def test_attention_outputs(self): - if not self.has_attentions: - self.skipTest(reason="Model does not output attentions") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - if self.is_encoder_decoder: - correct_outlen = 5 - - # loss is at first position - if "labels" in inputs_dict: - correct_outlen += 1 # loss is added to beginning - # Question Answering model returns start_logits and end_logits - if model_class.__name__ in [ - *get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES), - *get_values(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES), - ]: - correct_outlen += 1 # start_logits and end_logits instead of only 1 output - if "past_key_values" in outputs: - correct_outlen += 1 # past_key_values have been returned - - self.assertEqual(out_len, correct_outlen) - - # decoder attentions - decoder_attentions = outputs.decoder_attentions - self.assertIsInstance(decoder_attentions, (list, tuple)) - self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], - ) - - # cross attentions - cross_attentions = outputs.cross_attentions - self.assertIsInstance(cross_attentions, (list, tuple)) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) - self.assertListEqual( - list(cross_attentions[0].shape[-3:]), - [ - self.model_tester.num_attention_heads, - decoder_seq_length, - 
encoder_key_length, - ], - ) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.eval() - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - def test_headmasking(self): - if not self.test_head_masking: - self.skipTest(reason="Model does not support head masking") - - global_rng.seed(42) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - global_rng.seed() - - inputs_dict["output_attentions"] = True - config.output_hidden_states = True - configs_no_init = _config_zero_init(config) # To be sure we have no Nan - for model_class in self.all_model_classes: - model = model_class(config=configs_no_init) - model.eval() - - # Prepare head_mask - # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = ops.ones( - self.model_tester.num_hidden_layers, - self.model_tester.num_attention_heads, - ) - head_mask[0, 0] = 0 - head_mask[-1, :-1] = 0 - head_mask.requires_grad =True - inputs = self._prepare_for_class(inputs_dict, model_class).copy() - inputs["head_mask"] = head_mask - if model.config.is_encoder_decoder: - signature = inspect.signature(model.forward) - arg_names = [*signature.parameters.keys()] - if "decoder_head_mask" in arg_names: # necessary diferentiation because of T5 model - inputs["decoder_head_mask"] = head_mask - if "cross_attn_head_mask" in arg_names: - inputs["cross_attn_head_mask"] = head_mask - outputs = model(**inputs, return_dict=True) - - # Test that we can get a gradient back for importance score computation - output = sum(t.sum() for t in outputs[0]) - output = output.sum() - # output.backward() - # multihead_outputs = head_mask.grad - - # self.assertIsNotNone(multihead_outputs) - # self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) - - # def check_attentions_validity(attentions): - # # Remove Nan - # for t in attentions: - # self.assertLess( - # ops.sum(ops.isnan(t)), t.numel() / 4 - # ) # Check we don't have more than 25% nans (arbitrary) - # attentions = [ - # t.masked_fill(ops.isnan(t), 0.0) for t in attentions - # ] # remove them (the test is less complete) - - # self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) - # self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) - # if len(attentions) > 2: # encoder-decoder models have only 2 layers in each module - # self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - # self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 
0.0) - # self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) - - # if model.config.is_encoder_decoder: - # check_attentions_validity(outputs.encoder_attentions) - # check_attentions_validity(outputs.decoder_attentions) - # check_attentions_validity(outputs.cross_attentions) - # else: - # check_attentions_validity(outputs.attentions) - - def test_head_pruning(self): - if not self.test_pruning: - self.skipTest(reason="Pruning is not activated") - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - model = model_class(config=config) - model.eval() - heads_to_prune = { - 0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0], - } - model.prune_heads(heads_to_prune) - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], 1) - # TODO: To have this check, we will need at least 3 layers. Do we really need it? - # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_save_load_from_pretrained(self): - if not self.test_pruning: - self.skipTest(reason="Pruning is not activated") - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - model = model_class(config=config) - model.eval() - heads_to_prune = { - 0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0], - } - model.prune_heads(heads_to_prune) - - with tempfile.TemporaryDirectory() as temp_dir_name: - model.save_pretrained(temp_dir_name) - model = model_class.from_pretrained(temp_dir_name) - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], 1) - # TODO: To have this check, we will need at least 3 layers. Do we really need it? - # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_save_load_from_config_init(self): - if not self.test_pruning: - self.skipTest(reason="Pruning is not activated") - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - - heads_to_prune = { - 0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0], - } - config.pruned_heads = heads_to_prune - - model = model_class(config=config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], 1) - # TODO: To have this check, we will need at least 3 layers. Do we really need it? 
- # self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - - def test_head_pruning_integration(self): - if not self.test_pruning: - self.skipTest(reason="Pruning is not activated") - - for model_class in self.all_model_classes: - ( - config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - if "head_mask" in inputs_dict: - del inputs_dict["head_mask"] - - inputs_dict["output_attentions"] = True - config.output_hidden_states = False - - heads_to_prune = {1: [1, 2]} - config.pruned_heads = heads_to_prune - - model = model_class(config=config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - - with tempfile.TemporaryDirectory() as temp_dir_name: - model.save_pretrained(temp_dir_name) - model = model_class.from_pretrained(temp_dir_name) - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 0) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - - heads_to_prune = {0: [0], 1: [1, 2]} - model.prune_heads(heads_to_prune) - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs[-1] - - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) - self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) - - self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2]}) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.eval() - - with no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - 
- # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # def test_retain_grad_hidden_states_attentions(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # config.output_hidden_states = True - # config.output_attentions = self.has_attentions - - # # no need to test all models as different heads yield the same functionality - # model_class = self.all_model_classes[0] - # model = model_class(config) - - # inputs = self._prepare_for_class(inputs_dict, model_class) - - # outputs = model(**inputs) - - # output = outputs[0] - - # if config.is_encoder_decoder: - # # Seq2Seq models - # encoder_hidden_states = outputs.encoder_hidden_states[0] - # encoder_hidden_states.retain_grad() - - # decoder_hidden_states = outputs.decoder_hidden_states[0] - # decoder_hidden_states.retain_grad() - - # if self.has_attentions: - # encoder_attentions = outputs.encoder_attentions[0] - # encoder_attentions.retain_grad() - - # decoder_attentions = outputs.decoder_attentions[0] - # decoder_attentions.retain_grad() - - # cross_attentions = outputs.cross_attentions[0] - # cross_attentions.retain_grad() - - # output.flatten()[0].backward(retain_graph=True) - - # self.assertIsNotNone(encoder_hidden_states.grad) - # self.assertIsNotNone(decoder_hidden_states.grad) - - # if self.has_attentions: - # self.assertIsNotNone(encoder_attentions.grad) - # self.assertIsNotNone(decoder_attentions.grad) - # self.assertIsNotNone(cross_attentions.grad) - # else: - # # Encoder-/Decoder-only models - # hidden_states = outputs.hidden_states[0] - # hidden_states.retain_grad() - - # if self.has_attentions: - # attentions = outputs.attentions[0] - # attentions.retain_grad() - - # output.flatten()[0].backward(retain_graph=True) - - # self.assertIsNotNone(hidden_states.grad) - - # if self.has_attentions: - # self.assertIsNotNone(attentions.grad) - - def test_feed_forward_chunking(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - set_seed(0) - config = copy.deepcopy(original_config) - model = model_class(config) - model.eval() - - hidden_states_no_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] - - set_seed(0) - config.chunk_size_feed_forward = 1 - model = model_class(config) - model.eval() - hidden_states_with_chunk = model(**self._prepare_for_class(inputs_dict, model_class))[0] - self.assertTrue(ops.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) - - def test_resize_position_vector_embeddings(self): - if not self.test_resize_position_embeddings: - self.skipTest(reason="Model does not have position embeddings") - - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - if self.model_tester.is_training is False: - model.eval() - - max_position_embeddings = config.max_position_embeddings - - # Retrieve the embeddings and clone theme - if model.config.is_encoder_decoder: - encoder_model_embed, decoder_model_embed = model.get_position_embeddings() - encoder_cloned_embeddings = encoder_model_embed.weight.clone() - decoder_cloned_embeddings = decoder_model_embed.weight.clone() - else: - model_embed = model.get_position_embeddings() - 
cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the position embeddings with a larger max_position_embeddings increases - # the model's postion embeddings size - model.resize_position_embeddings(max_position_embeddings + 10) - self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10) - - # Check that it actually resizes the embeddings matrix - if model.config.is_encoder_decoder: - encoder_model_embed, decoder_model_embed = model.get_position_embeddings() - self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] + 10) - self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] + 10) - else: - model_embed = model.get_position_embeddings() - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the position embeddings with a smaller max_position_embeddings decreases - # the model's max_position_embeddings - model.resize_position_embeddings(max_position_embeddings - 5) - self.assertEqual(model.config.max_position_embeddings, max_position_embeddings - 5) - - # Check that it actually resizes the embeddings matrix - if model.config.is_encoder_decoder: - encoder_model_embed, decoder_model_embed = model.get_position_embeddings() - self.assertEqual(encoder_model_embed.weight.shape[0], encoder_cloned_embeddings.shape[0] - 5) - self.assertEqual(decoder_model_embed.weight.shape[0], decoder_cloned_embeddings.shape[0] - 5) - else: - model_embed = model.get_position_embeddings() - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
- models_equal = True - - if model.config.is_encoder_decoder: - for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - else: - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - def test_resize_tokens_embeddings(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to `False`") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - model_embed_pre_resize = model.get_input_embeddings() - type_model_embed_pre_resize = type(model_embed_pre_resize) - - if self.model_tester.is_training is False: - model.eval() - - model_vocab_size = config.text_config.vocab_size if hasattr(config, "text_config") else config.vocab_size - # Retrieve the embeddings and clone theme - model_embed = model.resize_token_embeddings(model_vocab_size) - cloned_embeddings = model_embed.weight.clone() - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size + 10) - new_model_vocab_size = ( - model.config.text_config.vocab_size - if hasattr(model.config, "text_config") - else model.config.vocab_size - ) - self.assertEqual(new_model_vocab_size, model_vocab_size + 10) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) - # Check to make sure the type of embeddings returned post resizing is same as type of input - type_model_embed_post_resize = type(model_embed) - self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model_embed = model.resize_token_embeddings(model_vocab_size - 15) - new_model_vocab_size = ( - model.config.text_config.vocab_size - if hasattr(model.config, "text_config") - else model.config.vocab_size - ) - self.assertEqual(new_model_vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) - - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - - # make sure that decoder_input_ids are resized as well - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
- models_equal = True - for p1, p2 in zip(cloned_embeddings, model_embed.weight): - if p1.ne(p2).sum() > 0: - models_equal = False - - self.assertTrue(models_equal) - - config = copy.deepcopy(original_config) - model = model_class(config) - - model_vocab_size = config.text_config.vocab_size if hasattr(config, "text_config") else config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) - new_model_vocab_size = ( - model.config.text_config.vocab_size - if hasattr(model.config, "text_config") - else model.config.vocab_size - ) - self.assertTrue(new_model_vocab_size + 10, model_vocab_size) - - model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) - new_model_vocab_size = ( - model.config.text_config.vocab_size - if hasattr(model.config, "text_config") - else model.config.vocab_size - ) - self.assertTrue(model_embed.weight.shape[0] // 64, 0) - - self.assertTrue(model_embed.weight.shape[0], new_model_vocab_size) - self.assertTrue(new_model_vocab_size, model.vocab_size) - - model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0] // 64, 0) - - # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size - target_dimension = 128 - model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) - self.assertTrue(model_embed.weight.shape[0], target_dimension) - - with self.assertRaisesRegex( - ValueError, - "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer", - ): - model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) - - def test_resize_embeddings_untied(self): - ( - original_config, - inputs_dict, - ) = self.model_tester.prepare_config_and_inputs_for_common() - if not self.test_resize_embeddings: - self.skipTest(reason="test_resize_embeddings is set to `False`") - - original_config.tie_word_embeddings = False - - # if model cannot untied embeddings -> leave test - if original_config.tie_word_embeddings: - self.skipTest(reason="Model cannot untied embeddings") - - for model_class in self.all_model_classes: - config = copy.deepcopy(original_config) - model = model_class(config) - - # if no output embeddings -> leave test - if model.get_output_embeddings() is None: - continue - - # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size - model_vocab_size = config.text_config.vocab_size if hasattr(config, "text_config") else config.vocab_size - model.resize_token_embeddings(model_vocab_size + 10) - new_model_vocab_size = ( - model.config.text_config.vocab_size - if hasattr(model.config, "text_config") - else model.config.vocab_size - ) - self.assertEqual(new_model_vocab_size, model_vocab_size + 10) - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size - model.resize_token_embeddings(model_vocab_size - 15) - new_model_vocab_size = ( - model.config.text_config.vocab_size - if hasattr(model.config, 
"text_config") - else model.config.vocab_size - ) - self.assertEqual(new_model_vocab_size, model_vocab_size - 15) - # Check that it actually resizes the embeddings matrix - output_embeds = model.get_output_embeddings() - self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) - # Check bias if present - if output_embeds.bias is not None: - self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - # Input ids should be clamped to the maximum size of the vocabulary - inputs_dict["input_ids"] = inputs_dict["input_ids"].clamp(max=model_vocab_size - 15 - 1) - if "decoder_input_ids" in inputs_dict: - inputs_dict["decoder_input_ids"] = inputs_dict["decoder_input_ids"].clamp(max=model_vocab_size - 15 - 1) - # Check that the model can still do a forward pass successfully (every parameter should be resized) - model(**self._prepare_for_class(inputs_dict, model_class)) - - def test_model_get_set_embeddings(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding)) - - new_input_embedding_layer = nn.Embedding(10, 10) - model.set_input_embeddings(new_input_embedding_layer) - self.assertEqual(model.get_input_embeddings(), new_input_embedding_layer) - - x = model.get_output_embeddings() - self.assertTrue(x is None or isinstance(x, nn.Linear)) - - def test_model_main_input_name(self): - for model_class in self.all_model_classes: - model_signature = inspect.signature(getattr(model_class, "forward")) - # The main input is the name of the argument after `self` - observed_main_input_name = list(model_signature.parameters.keys())[1] - self.assertEqual(model_class.main_input_name, observed_main_input_name) - - def test_correct_missing_keys(self): - if not self.test_missing_keys: - self.skipTest(reason="test_missing_keys is set to `False`") - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - base_model_prefix = model.base_model_prefix - - if hasattr(model, base_model_prefix): - extra_params = {k: v for k, v in model.named_parameters() if not k.startswith(base_model_prefix)} - extra_params.update({k: v for k, v in model.named_buffers() if not k.startswith(base_model_prefix)}) - # Some models define this as None - if model._keys_to_ignore_on_load_missing: - for key in model._keys_to_ignore_on_load_missing: - extra_params.pop(key, None) - - if not extra_params: - # In that case, we *are* on a head model, but every - # single key is not actual parameters and this is - # tested in `test_tied_model_weights_key_ignore` test. 
- continue - - with tempfile.TemporaryDirectory() as temp_dir_name: - model.base_model.save_pretrained(temp_dir_name) - model, loading_info = model_class.from_pretrained(temp_dir_name, output_loading_info=True) - self.assertGreater(len(loading_info["missing_keys"]), 0, model.__class__.__name__) - - def test_tie_model_weights(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def check_same_values(layer_1, layer_2): - equal = True - for p1, p2 in zip(layer_1.weight, layer_2.weight): - if p1.ne(p2).sum() > 0: - equal = False - return equal - - for model_class in self.all_model_classes: - model_not_tied = model_class(config) - if model_not_tied.get_output_embeddings() is None: - continue - - config_tied = copy.deepcopy(config) - model_tied = model_class(config_tied) - params_tied = list(model_tied.parameters()) - # Check that the embedding layer and decoding layer are the same in size and in value - # self.assertTrue(check_same_values(embeddings, decoding)) - - # Check that after resize they remain tied. - vocab_size = config.text_config.vocab_size if hasattr(config, "text_config") else config.vocab_size - model_tied.resize_token_embeddings(vocab_size + 10) - params_tied_2 = list(model_tied.parameters()) - self.assertEqual(len(params_tied_2), len(params_tied)) - - @require_safetensors - def test_can_use_safetensors(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model_tied = model_class(config) - with tempfile.TemporaryDirectory() as d: - try: - model_tied.save_pretrained(d, safe_serialization=True) - except Exception as e: - raise Exception(f"Class {model_class.__name__} cannot be saved using safetensors: {e}") - - model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) - # Checking the state dicts are correct - reloaded_state = model_reloaded.state_dict() - for k, v in model_tied.state_dict().items(): - self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded") - assert ops.allclose(v, reloaded_state[k]), lambda x: f"{model_class.__name__}: Tensor {k}: {x}" - # Checking there was no complain of missing weights - self.assertEqual(infos["missing_keys"], []) - - # Checking the tensor sharing are correct - ptrs = defaultdict(list) - for k, v in model_tied.state_dict().items(): - ptrs[id(v)].append(k) - - shared_ptrs = {k: v for k, v in ptrs.items() if len(v) > 1} - - for _, shared_names in shared_ptrs.items(): - reloaded_ptrs = {id(reloaded_state[k]) for k in shared_names} - self.assertEqual( - len(reloaded_ptrs), - 1, - f"The shared pointers are incorrect, found different pointers for keys {shared_names}", - ) - - def test_load_save_without_tied_weights(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.tie_word_embeddings = False - for model_class in self.all_model_classes: - model = model_class(config) - with tempfile.TemporaryDirectory() as d: - model.save_pretrained(d) - - model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) - # Checking the state dicts are correct - reloaded_state = model_reloaded.state_dict() - for k, v in model.state_dict().items(): - self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded") - assert ops.allclose(v, reloaded_state[k]), lambda x: f"{model_class.__name__}: Tensor {k}: {x}" - # Checking there was no complain of missing weights - self.assertEqual(infos["missing_keys"], []) - - def test_tied_weights_keys(self): - config, _ = 
self.model_tester.prepare_config_and_inputs_for_common() - config.tie_word_embeddings = True - for model_class in self.all_model_classes: - model_tied = model_class(config) - - ptrs = collections.defaultdict(list) - for name, tensor in model_tied.state_dict().items(): - ptrs[id(tensor)].append(name) - - # These are all the pointers of shared tensors. - tied_params = [names for _, names in ptrs.items() if len(names) > 1] - - tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else [] - # Detect we get a hit for each key - for key in tied_weight_keys: - is_tied_key = any(re.search(key, p) for group in tied_params for p in group) - self.assertTrue(is_tied_key, f"{key} is not a tied weight key for {model_class}.") - - # Removed tied weights found from tied params -> there should only be one left after - for key in tied_weight_keys: - for i in range(len(tied_params)): - tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None] - - tied_params = [group for group in tied_params if len(group) > 1] - self.assertListEqual( - tied_params, - [], - f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", - ) - - def test_model_weights_reload_no_missing_tied_weights(self): - for model_class in self.all_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir) - - # We are nuking ALL weights on file, so every parameter should - # yell on load. We're going to detect if we yell too much, or too little. - placeholder_dict = {"tensor": mindspore.tensor([1, 2])} - safe_save_file(placeholder_dict, os.path.join(tmp_dir, "model.safetensors"), metadata={"format": "pt"}) - model_reloaded, infos = model_class.from_pretrained(tmp_dir, output_loading_info=True) - - params = dict(model_reloaded.named_parameters()) - params.update(dict(model_reloaded.named_buffers())) - param_names = set(params.keys()) - - missing_keys = set(infos["missing_keys"]) - - extra_missing = missing_keys - param_names - # Remove tied weights from extra missing: they are normally not warned as missing if their tied - # counterpart is present but here there are no weights at all so we do get the warning. - ptrs = collections.defaultdict(list) - for name, tensor in model_reloaded.state_dict().items(): - ptrs[id_tensor_storage(tensor)].append(name) - tied_params = [names for _, names in ptrs.items() if len(names) > 1] - for group in tied_params: - # We remove the group from extra_missing if not all weights from group are in it - if len(set(group) - extra_missing) > 0: - extra_missing = extra_missing - set(group) - - self.assertEqual( - extra_missing, - set(), - f"This model {model_class.__name__} might be missing some `keys_to_ignore`: {extra_missing}. 
" - f"For debugging, tied parameters are {tied_params}", - ) - - missed_missing = param_names - missing_keys - # Remove nonpersistent buffers from missed_missing - buffers = [n for n, _ in model_reloaded.named_buffers()] - nonpersistent_buffers = {n for n in buffers if n not in model_reloaded.state_dict()} - missed_missing = missed_missing - nonpersistent_buffers - - if model_reloaded._keys_to_ignore_on_load_missing is None: - expected_missing = set() - else: - expected_missing = set() - for pattern in model_reloaded._keys_to_ignore_on_load_missing: - expected_missing.update({k for k in param_names if re.search(pattern, k) is not None}) - self.assertEqual( - missed_missing, - expected_missing, - f"This model {model_class.__name__} ignores keys {missed_missing} but they look like real" - " parameters. If they are non persistent buffers make sure to instantiate them with" - " `persistent=False`", - ) - - def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - def set_nan_tensor_to_zero(t): - if ON_ORANGE_PI: - t = ops.where(t != t, 0, t) - else: - t[t != t] = 0 - return t - - def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): - with no_grad(): - tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs) - dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs).to_tuple() - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values() - ): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - ops.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-4 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {ops.max(ops.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {ops.isnan(tuple_object).any()} and `inf`: {ops.isinf(tuple_object)}. Dict has" - f" `nan`: {ops.isnan(dict_object).any()} and `inf`: {ops.isinf(dict_object)}." 
- ), - ) - - recursive_check(tuple_output, dict_output) - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - - if self.has_attentions: - tuple_inputs = self._prepare_for_class(inputs_dict, model_class) - dict_inputs = self._prepare_for_class(inputs_dict, model_class) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) - - tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - check_equivalence( - model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} - ) - - # Don't copy this method to model specific test file! - # TODO: remove this method once the issues are all fixed! - def _make_attention_mask_non_null(self, inputs_dict): - """Make sure no sequence has all zeros as attention mask""" - - for k in ["attention_mask", "encoder_attention_mask", "decoder_attention_mask"]: - if k in inputs_dict: - attention_mask = inputs_dict[k] - - # Make sure no all 0s attention masks - to avoid failure at this moment. - # Put `1` at the beginning of sequences to make it still work when combining causal attention masks. - # TODO: remove this line once a fix regarding large negative values for attention mask is done. - attention_mask = ops.cat( - [ops.ones_like(attention_mask[:, :1], dtype=attention_mask.dtype), attention_mask[:, 1:]], dim=-1 - ) - - # Here we make the first sequence with all 0s as attention mask. - # Currently, this will fail for `TFWav2Vec2Model`. This is caused by the different large negative - # values, like `1e-4`, `1e-9`, `1e-30` and `-inf` for attention mask across models/frameworks. - # TODO: enable this block once the large negative values thing is cleaned up. - # (see https://github.com/huggingface/transformers/issues/14859) - # attention_mask = ops.cat( - # [ops.zeros_like(attention_mask[:1], dtype=attention_mask.dtype), attention_mask[1:]], - # dim=0 - # ) - - inputs_dict[k] = attention_mask - - # Don't copy this method to model specific test file! - # TODO: remove this method once the issues are all fixed! 
- def _postprocessing_to_ignore_test_cases(self, tf_outputs, pt_outputs, model_class): - """For temporarily ignoring some failed test cases (issues to be fixed)""" - - tf_keys = {k for k, v in tf_outputs.items() if v is not None} - pt_keys = {k for k, v in pt_outputs.items() if v is not None} - - key_differences = tf_keys.symmetric_difference(pt_keys) - - if model_class.__name__ in [ - "FlaubertWithLMHeadModel", - "FunnelForPreTraining", - "ElectraForPreTraining", - "XLMWithLMHeadModel", - ]: - for k in key_differences: - if k in ["loss", "losses"]: - tf_keys.discard(k) - pt_keys.discard(k) - elif model_class.__name__.startswith("GPT2"): - # `TFGPT2` has `past_key_values` as a tensor while `GPT2` has it as a tuple. - tf_keys.discard("past_key_values") - pt_keys.discard("past_key_values") - - # create new outputs from the remaining fields - new_tf_outputs = type(tf_outputs)(**{k: tf_outputs[k] for k in tf_keys}) - new_pt_outputs = type(pt_outputs)(**{k: pt_outputs[k] for k in pt_keys}) - - return new_tf_outputs, new_pt_outputs - - def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): - diff = np.abs((a - b)).max() - self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") - - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with no_grad(): - model(**inputs)[0] - - def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if model_class.__name__ not in get_values(MODEL_MAPPING_NAMES): - continue - model = model_class(config) - model.eval() - - model_forward_args = inspect.signature(model.forward).parameters - if "inputs_embeds" not in model_forward_args: - self.skipTest(reason="This model doesn't use `inputs_embeds`") - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids = ops.where(input_ids == pad_token_id, not_pad_token_id, input_ids) - # input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - with no_grad(): - out_ids = model(input_ids=input_ids, **inputs)[0] - out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - # 
encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - # decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - encoder_input_ids = ops.where(encoder_input_ids == pad_token_id, max(0, pad_token_id + 1), encoder_input_ids) - decoder_input_ids = ops.where(decoder_input_ids == pad_token_id, max(0, pad_token_id + 1), decoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - inputs_embeds = wte(encoder_input_ids) - decoder_inputs_embeds = wte(decoder_input_ids) - with no_grad(): - out_ids = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs)[0] - out_embeds = model( - inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, **inputs - )[0] - self.assertTrue(ops.allclose(out_embeds, out_ids)) - - # @require_mindspore_multi_gpu - # def test_multi_gpu_data_parallel_forward(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # # some params shouldn't be scattered by nn.DataParallel - # # so just remove them if they are present. - # blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"] - # for k in blacklist_non_batched_params: - # inputs_dict.pop(k, None) - - # # move input tensors to cuda:O - # for k, v in inputs_dict.items(): - # if torch.is_tensor(v): - # inputs_dict[k] = v.to(0) - - # for model_class in self.all_model_classes: - # model = model_class(config=config) - # model.to(0) - # model.eval() - - # # Wrap model in nn.DataParallel - # model = nn.DataParallel(model) - # with no_grad(): - # _ = model(**self._prepare_for_class(inputs_dict, model_class)) - - # @require_mindspore_multi_gpu - # def test_model_parallelization(self): - # if not self.test_model_parallel: - # self.skipTest(reason="test_model_parallel is set to False") - - # # a candidate for testing_utils - # def get_current_gpu_memory_use(): - # """returns a list of cuda memory allocations per GPU in MBs""" - - # per_device_memory = [] - # for id in range(torch.cuda.device_count()): - # with torch.cuda.device(id): - # per_device_memory.append(torch.cuda.memory_allocated() >> 20) - - # return per_device_memory - - # # Needs a large model to see the difference. - # config = self.model_tester.get_large_model_config() - - # for model_class in self.all_parallelizable_model_classes: - # torch.cuda.empty_cache() - - # # 1. single gpu memory load + unload + memory measurements - # # Retrieve initial memory usage (can easily be ~0.6-1.5GB if cuda-kernels have been preloaded by previous tests) - # memory_at_start = get_current_gpu_memory_use() - - # # Put model on device 0 and take a memory snapshot - # model = model_class(config) - # model.to("cuda:0") - # memory_after_model_load = get_current_gpu_memory_use() - - # # The memory use on device 0 should be higher than it was initially. - # self.assertGreater(memory_after_model_load[0], memory_at_start[0]) - - # del model - # gc.collect() - # torch.cuda.empty_cache() - - # # 2. 
MP test - # # it's essential to re-calibrate the usage before the next stage - # memory_at_start = get_current_gpu_memory_use() - - # # Spread model layers over multiple devices - # model = model_class(config) - # model.parallelize() - # memory_after_parallelization = get_current_gpu_memory_use() - - # # Assert that the memory use on all devices is higher than it was when loaded only on CPU - # for n in range(len(model.device_map.keys())): - # self.assertGreater(memory_after_parallelization[n], memory_at_start[n]) - - # # Assert that the memory use of device 0 is lower than it was when the entire model was loaded on it - # self.assertLess(memory_after_parallelization[0], memory_after_model_load[0]) - - # # Assert that the memory use of device 1 is higher than it was when the entire model was loaded - # # on device 0 and device 1 wasn't used at all - # self.assertGreater(memory_after_parallelization[1], memory_after_model_load[1]) - - # del model - # gc.collect() - # torch.cuda.empty_cache() - - # @require_mindspore_multi_gpu - # def test_model_parallel_equal_results(self): - # if not self.test_model_parallel: - # self.skipTest(reason="test_model_parallel is set to False") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # for model_class in self.all_parallelizable_model_classes: - # inputs_dict = self._prepare_for_class(inputs_dict, model_class) - - # def cast_to_device(dictionary, device): - # output = {} - # for k, v in dictionary.items(): - # if isinstance(v, mindspore.Tensor): - # output[k] = v.to(device) - # else: - # output[k] = v - - # return output - - # model = model_class(config) - # output = model(**cast_to_device(inputs_dict, "cpu")) - - # model.parallelize() - - # parallel_output = model(**cast_to_device(inputs_dict, "cuda:0")) - - # for value, parallel_value in zip(output, parallel_output): - # if isinstance(value, mindspore.Tensor): - # self.assertTrue(ops.allclose(value, parallel_value.to("cpu"), atol=1e-7)) - # elif isinstance(value, (Tuple, List)): - # for value_, parallel_value_ in zip(value, parallel_value): - # self.assertTrue(ops.allclose(value_, parallel_value_.to("cpu"), atol=1e-7)) - - # @require_accelerate - # @mark.accelerate_tests - # @require_mindspore_gpu - # def test_disk_offload_bin(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # for model_class in self.all_model_classes: - # if model_class._no_split_modules is None: - # continue - - # inputs_dict_class = self._prepare_for_class(inputs_dict, model_class) - # model = model_class(config).eval() - # set_seed(0) - # base_output = model(**inputs_dict_class) - - # model_size = compute_module_sizes(model)[""] - # with tempfile.TemporaryDirectory() as tmp_dir: - # model.save_pretrained(tmp_dir, safe_serialization=False) - - # with self.assertRaises(ValueError): - # max_size = int(self.model_split_percents[0] * model_size) - # max_memory = {0: max_size, "cpu": max_size} - # # This errors out cause it's missing an offload folder - # new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) - - # max_size = int(self.model_split_percents[1] * model_size) - # max_memory = {0: max_size, "cpu": max_size} - # new_model = model_class.from_pretrained( - # tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir - # ) - - # self.check_device_map_is_respected(new_model, new_model.hf_device_map) - # set_seed(0) - # new_output = new_model(**inputs_dict_class) - - # if isinstance(base_output[0], 
tuple) and isinstance(new_output[0], tuple): - # self.assertTrue(ops.allclose(a, b, atol=1e-5) for a, b in zip(base_output[0], new_output[0])) - # else: - # self.assertTrue(ops.allclose(base_output[0], new_output[0], atol=1e-5)) - - # @require_accelerate - # @mark.accelerate_tests - # @require_mindspore_gpu - # def test_disk_offload_safetensors(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # for model_class in self.all_model_classes: - # if model_class._no_split_modules is None: - # continue - - # inputs_dict_class = self._prepare_for_class(inputs_dict, model_class) - # model = model_class(config).eval() - # set_seed(0) - # base_output = model(**inputs_dict_class) - - # model_size = compute_module_sizes(model)[""] - # with tempfile.TemporaryDirectory() as tmp_dir: - # model.save_pretrained(tmp_dir) - - # max_size = int(self.model_split_percents[1] * model_size) - # max_memory = {0: max_size, "cpu": max_size} - - # # This doesn't error out as it's in safetensors and doesn't need an offload folder - # new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) - - # self.check_device_map_is_respected(new_model, new_model.hf_device_map) - # set_seed(0) - # new_output = new_model(**inputs_dict_class) - - # if isinstance(base_output[0], tuple) and isinstance(new_output[0], tuple): - # self.assertTrue(ops.allclose(a, b, atol=1e-5) for a, b in zip(base_output[0], new_output[0])) - # else: - # self.assertTrue(ops.allclose(base_output[0], new_output[0], atol=1e-5)) - - # @require_accelerate - # @mark.accelerate_tests - # @require_mindspore_gpu - # def test_cpu_offload(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # for model_class in self.all_model_classes: - # if model_class._no_split_modules is None: - # continue - - # inputs_dict_class = self._prepare_for_class(inputs_dict, model_class) - # model = model_class(config).eval() - - # set_seed(0) - # base_output = model(**inputs_dict_class) - - # model_size = compute_module_sizes(model)[""] - # # We test several splits of sizes to make sure it works. 
- # max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]] - # with tempfile.TemporaryDirectory() as tmp_dir: - # model.save_pretrained(tmp_dir) - - # for max_size in max_gpu_sizes: - # max_memory = {0: max_size, "cpu": model_size * 2} - # new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) - # # Making sure part of the model will actually end up offloaded - # self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"}) - - # self.check_device_map_is_respected(new_model, new_model.hf_device_map) - - # set_seed(0) - # new_output = new_model(**inputs_dict_class) - - # if isinstance(base_output[0], tuple) and isinstance(new_output[0], tuple): - # self.assertTrue(ops.allclose(a, b, atol=1e-5) for a, b in zip(base_output[0], new_output[0])) - # else: - # self.assertTrue(ops.allclose(base_output[0], new_output[0], atol=1e-5)) - - # @require_accelerate - # @mark.accelerate_tests - # @require_mindspore_multi_accelerator - # def test_model_parallelism(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # for model_class in self.all_model_classes: - # if model_class._no_split_modules is None: - # continue - - # inputs_dict_class = self._prepare_for_class(inputs_dict, model_class) - # model = model_class(config).eval() - - # set_seed(0) - # base_output = model(**inputs_dict_class) - - # model_size = compute_module_sizes(model)[""] - # # We test several splits of sizes to make sure it works. - # max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]] - # with tempfile.TemporaryDirectory() as tmp_dir: - # model.save_pretrained(tmp_dir) - - # for max_size in max_gpu_sizes: - # max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2} - # new_model = model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) - # # Making sure part of the model will actually end up offloaded - # self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1}) - # self.check_device_map_is_respected(new_model, new_model.hf_device_map) - - # set_seed(0) - # new_output = new_model(**inputs_dict_class) - - # if isinstance(base_output[0], tuple) and isinstance(new_output[0], tuple): - # self.assertTrue(ops.allclose(a, b, atol=1e-5) for a, b in zip(base_output[0], new_output[0])) - # else: - # self.assertTrue(ops.allclose(base_output[0], new_output[0], atol=1e-5)) - - def test_problem_types(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - problem_types = [ - {"title": "multi_label_classification", "num_labels": 2, "dtype": mindspore.float32}, - {"title": "single_label_classification", "num_labels": 1, "dtype": mindspore.int64}, - {"title": "regression", "num_labels": 1, "dtype": mindspore.float32}, - ] - - for model_class in self.all_model_classes: - if model_class.__name__ not in [ - *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES), - *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES), - ]: - continue - - for problem_type in problem_types: - with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] - config.num_labels = problem_type["num_labels"] - - model = model_class(config) - model.train() - - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - - if problem_type["num_labels"] > 1: - inputs["labels"] = inputs["labels"].unsqueeze(1).tile((1, problem_type["num_labels"])) - - inputs["labels"] = 
inputs["labels"].to(problem_type["dtype"]) - - # This tests that we do not trigger the warning form PyTorch "Using a target size that is different - # to the input size. This will likely lead to incorrect results due to broadcasting. Please ensure - # they have the same size." which is a symptom something in wrong for the regression problem. - # See https://github.com/huggingface/transformers/issues/11780 - with warnings.catch_warnings(record=True) as warning_list: - loss = model(**inputs).loss - for w in warning_list: - if "Using a target size that is different to the input size" in str(w.message): - raise ValueError( - f"Something is going wrong in the regression problem: intercepted {w.message}" - ) - - # loss.backward() - - def test_load_with_mismatched_shapes(self): - if not self.test_mismatched_shapes: - self.skipTest(reason="test_missmatched_shapes is set to False") - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if model_class.__name__ not in get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES): - continue - - with self.subTest(msg=f"Testing {model_class}"): - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(config) - model.save_pretrained(tmp_dir) - - # Fails when we don't set ignore_mismatched_sizes=True - with self.assertRaises(RuntimeError): - new_model = AutoModelForSequenceClassification.from_pretrained(tmp_dir, num_labels=42) - with self.assertRaises(RuntimeError): - new_model_without_prefix = AutoModel.from_pretrained(tmp_dir, vocab_size=10) - - logger = logging.get_logger("mindnlp.transformers.modeling_utils") - - with CaptureLogger(logger) as cl: - new_model = AutoModelForSequenceClassification.from_pretrained( - tmp_dir, num_labels=42, ignore_mismatched_sizes=True - ) - self.assertIn("the shapes did not match", cl.out) - inputs = self._prepare_for_class(inputs_dict, model_class) - logits = new_model(**inputs).logits - self.assertEqual(logits.shape[1], 42) - - with CaptureLogger(logger) as cl: - new_model_without_prefix = AutoModel.from_pretrained( - tmp_dir, vocab_size=10, ignore_mismatched_sizes=True - ) - self.assertIn("the shapes did not match", cl.out) - input_ids = ids_tensor((2, 8), 10) - if self.is_encoder_decoder: - new_model_without_prefix(input_ids, decoder_input_ids=input_ids) - else: - new_model_without_prefix(input_ids) - - def test_mismatched_shapes_have_properly_initialized_weights(self): - if not self.test_mismatched_shapes: - self.skipTest(reason="test_missmatched_shapes is set to False") - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - configs_no_init = _config_zero_init(config) - - for model_class in self.all_model_classes: - mappings = [ - MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, - MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES, - ] - is_classication_model = any(model_class.__name__ in get_values(mapping) for mapping in mappings) - - if not is_classication_model: - continue - - # TODO: ydshieh - is_special_classes = model_class.__name__ in [ - "wav2vec2.masked_spec_embed", - "Wav2Vec2ForSequenceClassification", - "CLIPForImageClassification", - "RegNetForImageClassification", - "ResNetForImageClassification", - "UniSpeechSatForSequenceClassification", - "Wav2Vec2BertForSequenceClassification", - "PvtV2ForImageClassification", - "Wav2Vec2ConformerForSequenceClassification", - "WavLMForSequenceClassification", 
- "SwiftFormerForImageClassification", - "SEWForSequenceClassification", - "BitForImageClassification", - "SEWDForSequenceClassification", - "SiglipForImageClassification", - "HubertForSequenceClassification", - "Swinv2ForImageClassification", - "Data2VecAudioForSequenceClassification", - "UniSpeechForSequenceClassification", - "PvtForImageClassification", - ] - special_param_names = [ - r"^bit\.", - r"^classifier\.weight", - r"^classifier\.bias", - r"^classifier\..+\.weight", - r"^classifier\..+\.bias", - r"^data2vec_audio\.", - r"^dist_head\.", - r"^head\.", - r"^hubert\.", - r"^pvt\.", - r"^pvt_v2\.", - r"^regnet\.", - r"^resnet\.", - r"^sew\.", - r"^sew_d\.", - r"^swiftformer\.", - r"^swinv2\.", - r"^transformers\.models\.swiftformer\.", - r"^unispeech\.", - r"^unispeech_sat\.", - r"^vision_model\.", - r"^wav2vec2\.", - r"^wav2vec2_bert\.", - r"^wav2vec2_conformer\.", - r"^wavlm\.", - ] - - with self.subTest(msg=f"Testing {model_class}"): - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(configs_no_init) - model.save_pretrained(tmp_dir) - - # Fails when we don't set ignore_mismatched_sizes=True - with self.assertRaises(RuntimeError): - new_model = model_class.from_pretrained(tmp_dir, num_labels=42) - - logger = logging.get_logger("mindnlp.transformers.modeling_utils") - - with CaptureLogger(logger) as cl: - new_model = model_class.from_pretrained(tmp_dir, num_labels=42, ignore_mismatched_sizes=True) - self.assertIn("the shapes did not match", cl.out) - - for name, param in new_model.named_parameters(): - if param.requires_grad: - param_mean = ((ops.mean(param) * 1e9).round() / 1e9).item() - if not ( - is_special_classes - and any(len(re.findall(target, name)) > 0 for target in special_param_names) - ): - self.assertIn( - param_mean, - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - else: - # Here we allow the parameters' mean to be in the range [-5.0, 5.0] instead of being - # either `0.0` or `1.0`, because their initializations are not using - # `config.initializer_factor` (or something similar). The purpose of this test is simply - # to make sure they are properly initialized (to avoid very large value or even `nan`). - self.assertGreaterEqual( - param_mean, - -5.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - self.assertLessEqual( - param_mean, - 5.0, - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) - - def test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist(self): - # 1. Create a dummy class. Should have buffers as well? To make sure we test __init__ - class MyClass(PreTrainedModel): - config_class = PretrainedConfig - - def __init__(self, config=None): - super().__init__(config if config is not None else PretrainedConfig()) - self.linear = nn.Linear(10, config.num_labels, bias=True) - self.embedding = nn.Embedding(10, 10) - self.std = 1 - - def _init_weights(self, module): - if isinstance(module, nn.Linear): - nn.init.kaiming_uniform_(module.weight, np.sqrt(5)) - if module.bias is not None: - nn.init.normal_(module.bias, mean=0.0, std=self.std) - - # Used to make sure the weights with matched shape are loaded correctly - config = PretrainedConfig() - config.num_labels = 3 - model = MyClass(config=config) - - # Used to make sure the weights with mismatched shape are properly initialized - set_seed(0) - config = PretrainedConfig() - config.num_labels = 4 - # not to init. 
the weights during the creation: to match the logic in `from_pretrained`, so we can keep the - # same sequence of random ops in the execution path to allow us to compare `target_model` and `new_model` below - # for `linear` part. - with ContextManagers([no_init_weights(True)]): - target_model = MyClass(config=config) - target_model.apply(target_model._initialize_weights) - - with tempfile.TemporaryDirectory() as tmpdirname: - state_dict = model.state_dict() - del state_dict["linear.weight"] - - model.config.save_pretrained(tmpdirname) - save_checkpoint(state_dict, os.path.join(tmpdirname, "mindspore_model.ckpt")) - - set_seed(0) - new_model = MyClass.from_pretrained(tmpdirname, num_labels=4, ignore_mismatched_sizes=True) - - for key in new_model.state_dict().keys(): - # check weight values for weights with matched shapes are identical - # (i.e. correctly loaded from the checkpoint) - if key not in ["linear.weight", "linear.bias"]: - max_diff = ops.max(ops.abs(model.state_dict()[key] - new_model.state_dict()[key])) - self.assertLessEqual( - max_diff.item(), - 1e-6, - msg=f"the weight values for `{key}` in `new_model` and `model` are not identical", - ) - else: - # check we have some mismatched shapes - self.assertNotEqual( - model.state_dict()[key].shape, - new_model.state_dict()[key].shape, - msg=f"the weight shapes for {key} in `model` and `new_model` should differ", - ) - # check the weights with mismatched shape are properly initialized - max_diff = ops.max(ops.abs(new_model.state_dict()[key] - target_model.state_dict()[key])) - self.assertLessEqual( - max_diff.item(), - 1e-6, - msg=f"the weight values for `{key}` in `new_model` and `target_model` are not identical", - ) - - def test_model_is_small(self): - # Just a consistency check to make sure we are not running tests on 80M parameter models. - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - num_params = model.num_parameters() - assert ( - num_params < 1000000 - ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." 
- - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @slow - # def test_flash_attn_2_conversion(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - # for model_class in self.all_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - # model = model_class.from_pretrained( - # tmpdirname, ms_dtype=mindspore.float16, attn_implementation="flash_attention_2" - # ) - - # for _, module in model.named_modules(): - # if "FlashAttention" in module.__class__.__name__: - # return - - # self.assertTrue(False, "FlashAttention2 modules not found in model") - - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @slow - # @is_flaky() - # def test_flash_attn_2_inference_equivalence(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # for model_class in self.all_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - # model_fa = model_class.from_pretrained( - # tmpdirname, ms_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - # ) - - # model = model_class.from_pretrained(tmpdirname, ms_dtype=torch.bfloat16) - # - # dummy_input = inputs_dict[model.main_input_name][:1] - # if dummy_input.dtype in [mindspore.float32, mindspore.float16]: - # dummy_input = dummy_input.to(torch.bfloat16) - - # dummy_attention_mask = inputs_dict.get("attention_mask", None) - - # if dummy_attention_mask is not None: - # dummy_attention_mask = dummy_attention_mask[:1] - # dummy_attention_mask[:, 1:] = 1 - # dummy_attention_mask[:, :1] = 0 - - # if model.config.is_encoder_decoder: - # decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[:1] - - # outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) - # outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) - # else: - # outputs = model(dummy_input, output_hidden_states=True) - # outputs_fa = model_fa(dummy_input, output_hidden_states=True) - - # logits = ( - # outputs.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs.decoder_hidden_states[-1] - # ) - # logits_fa = ( - # outputs_fa.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs_fa.decoder_hidden_states[-1] - # ) - - # assert ops.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) - - # if model.config.is_encoder_decoder: - # other_inputs = { - # "decoder_input_ids": decoder_input_ids, - # "decoder_attention_mask": dummy_attention_mask, - # "output_hidden_states": True, - # } - # if dummy_attention_mask is not None: - # other_inputs["attention_mask"] = dummy_attention_mask - - # outputs = model(dummy_input, **other_inputs) - # outputs_fa = model_fa(dummy_input, **other_inputs) - # else: - # other_inputs = { - # "output_hidden_states": True, - # } - # if dummy_attention_mask is not None: - # 
other_inputs["attention_mask"] = dummy_attention_mask - - # outputs = model(dummy_input, **other_inputs) - # outputs_fa = model_fa(dummy_input, **other_inputs) - - # logits = ( - # outputs.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs.decoder_hidden_states[-1] - # ) - # logits_fa = ( - # outputs_fa.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs_fa.decoder_hidden_states[-1] - # ) - - # assert ops.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2) - - # # check with inference + dropout - # model.train() - # _ = model_fa(dummy_input, **other_inputs) - - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @slow - # @is_flaky() - # def test_flash_attn_2_inference_equivalence_right_padding(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # for model_class in self.all_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - # model_fa = model_class.from_pretrained( - # tmpdirname, ms_dtype=torch.bfloat16, attn_implementation="flash_attention_2" - # ) - - # model = model_class.from_pretrained(tmpdirname, ms_dtype=torch.bfloat16) - # - # dummy_input = inputs_dict[model.main_input_name][:1] - # if dummy_input.dtype in [mindspore.float32, mindspore.float16]: - # dummy_input = dummy_input.to(torch.bfloat16) - - # dummy_attention_mask = inputs_dict.get("attention_mask", None) - - # if dummy_attention_mask is not None: - # dummy_attention_mask = dummy_attention_mask[:1] - # dummy_attention_mask[:, :-1] = 1 - # dummy_attention_mask[:, -1:] = 0 - - # if model.config.is_encoder_decoder: - # decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[:1] - - # outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) - # outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) - # else: - # outputs = model(dummy_input, output_hidden_states=True) - # outputs_fa = model_fa(dummy_input, output_hidden_states=True) - - # logits = ( - # outputs.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs.decoder_hidden_states[-1] - # ) - # logits_fa = ( - # outputs_fa.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs_fa.decoder_hidden_states[-1] - # ) - - # assert ops.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) - - # if model.config.is_encoder_decoder: - # other_inputs = { - # "decoder_input_ids": decoder_input_ids, - # "decoder_attention_mask": dummy_attention_mask, - # "output_hidden_states": True, - # } - # if dummy_attention_mask is not None: - # other_inputs["attention_mask"] = dummy_attention_mask - - # outputs = model(dummy_input, **other_inputs) - # outputs_fa = model_fa(dummy_input, **other_inputs) - # else: - # other_inputs = { - # "output_hidden_states": True, - # } - # if dummy_attention_mask is not None: - # other_inputs["attention_mask"] = dummy_attention_mask - - # outputs = model(dummy_input, **other_inputs) - # outputs_fa = model_fa(dummy_input, **other_inputs) - - # logits = ( - # outputs.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs.decoder_hidden_states[-1] - # ) - # 
logits_fa = ( - # outputs_fa.hidden_states[-1] - # if not model.config.is_encoder_decoder - # else outputs_fa.decoder_hidden_states[-1] - # ) - - # assert ops.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @slow - # @is_flaky() - # def test_flash_attn_2_generate_left_padding(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # for model_class in self.all_generative_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - # model = model_class.from_pretrained(tmpdirname, ms_dtype=mindspore.float16, low_cpu_mem_usage=True).to( - # torch_device - # ) - - # dummy_input = inputs_dict[model.main_input_name] - # if dummy_input.dtype in [mindspore.float32, torch.bfloat16]: - # dummy_input = dummy_input.to(mindspore.float16) - - # dummy_attention_mask = inputs_dict.get("attention_mask", ops.ones_like(dummy_input)) - # # make sure we do left padding - # dummy_attention_mask[:, :-1] = 0 - # dummy_attention_mask[:, -1:] = 1 - - # out = model.generate( - # dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - # ) - - # model = model_class.from_pretrained( - # tmpdirname, - # ms_dtype=mindspore.float16, - # attn_implementation="flash_attention_2", - # low_cpu_mem_usage=True, - # ) - - # out_fa = model.generate( - # dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - # ) - - # self.assertTrue(ops.allclose(out, out_fa)) - - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @is_flaky() - # @slow - # def test_flash_attn_2_generate_padding_right(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # for model_class in self.all_generative_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - # model = model_class.from_pretrained(tmpdirname, ms_dtype=mindspore.float16, low_cpu_mem_usage=True).to( - # torch_device - # ) - - # dummy_input = inputs_dict[model.main_input_name] - # if dummy_input.dtype in [mindspore.float32, torch.bfloat16]: - # dummy_input = dummy_input.to(mindspore.float16) - - # dummy_attention_mask = inputs_dict.get("attention_mask", ops.ones_like(dummy_input)) - # # make sure we do right padding - # dummy_attention_mask[:, :-1] = 1 - # dummy_attention_mask[:, -1:] = 0 - - # out = model.generate( - # dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - # ) - - # model = model_class.from_pretrained( - # tmpdirname, - # ms_dtype=mindspore.float16, - # attn_implementation="flash_attention_2", - # low_cpu_mem_usage=True, - # ) - - # out_fa = model.generate( - # dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - # ) - - # self.assertTrue(ops.allclose(out, out_fa)) - - - # @require_flash_attn - # @require_mindspore_gpu - # 
@mark.flash_attn_test - # @slow - # def test_flash_attn_2_generate_use_cache(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # max_new_tokens = 30 - - # for model_class in self.all_generative_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - # dummy_input = inputs_dict[model_class.main_input_name] - # if dummy_input.dtype in [mindspore.float32, torch.bfloat16]: - # dummy_input = dummy_input.to(mindspore.float16) - - # # make sure that all models have enough positions for generation - # if hasattr(config, "max_position_embeddings"): - # config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - - # dummy_attention_mask = inputs_dict.get("attention_mask", ops.ones_like(dummy_input)) - - # model = model_class.from_pretrained( - # tmpdirname, - # ms_dtype=mindspore.float16, - # attn_implementation="flash_attention_2", - # low_cpu_mem_usage=True, - # ) - - # # Just test that a large cache works as expected - # _ = model.generate( - # dummy_input, - # attention_mask=dummy_attention_mask, - # max_new_tokens=max_new_tokens, - # do_sample=False, - # use_cache=True, - # ) - - # # Generate with one batch only to test generation when attention mask will be None - # # when real inputs are used, because there is no padding. See issue #32237 for more - # dummy_input = dummy_input[:1, ...] - # dummy_attention_mask = ops.ones_like(dummy_attention_mask[:1, ...]) - # _ = model.generate( - # dummy_input, - # attention_mask=dummy_attention_mask, - # max_new_tokens=max_new_tokens, - # do_sample=False, - # use_cache=True, - # ) - - # @require_flash_attn - # @require_mindspore_gpu - # @require_bitsandbytes - # @mark.flash_attn_test - # @slow - # def test_flash_attn_2_fp32_ln(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # for model_class in self.all_generative_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # model = model_class(config) - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - - # dummy_input = inputs_dict[model.main_input_name] - # dummy_attention_mask = inputs_dict.get("attention_mask", ops.ones_like(dummy_input)) - # batch_size = dummy_attention_mask.shape[0] - - # is_padding_right = dummy_attention_mask[:, -1].sum().item() != batch_size - - # # To avoid errors with padding_side=="right" - # if is_padding_right: - # dummy_attention_mask = ops.ones_like(dummy_input) - - # model = model_class.from_pretrained( - # tmpdirname, - # ms_dtype=mindspore.float16, - # attn_implementation="flash_attention_2", - # low_cpu_mem_usage=True, - # load_in_4bit=True, - # ) - - # for _, param in model.named_parameters(): - # # upcast only layer norms - # if (param.dtype == mindspore.float16) or (param.dtype == torch.bfloat16): - # param = param.to(mindspore.float32) - - # if model.config.is_encoder_decoder: - # dummy_decoder_input_ids = inputs_dict["decoder_input_ids"] - # dummy_decoder_attention_mask = inputs_dict["decoder_attention_mask"] - 
- # _ = model(dummy_input, decoder_input_ids=dummy_decoder_input_ids) - # # with attention mask - # _ = model( - # dummy_input, - # attention_mask=dummy_attention_mask, - # decoder_input_ids=dummy_decoder_input_ids, - # decoder_attention_mask=dummy_decoder_attention_mask, - # ) - # else: - # _ = model(dummy_input) - # # with attention mask - # _ = model(dummy_input, attention_mask=dummy_attention_mask) - - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @slow - # def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # max_new_tokens = 30 - - # for model_class in self.all_generative_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # if 0 not in inputs_dict.get("attention_mask", []) or "attention_mask" not in inputs_dict: - # self.skipTest("Model dummy inputs should contain padding in their attention mask") - - # dummy_input = inputs_dict[model_class.main_input_name] - # if dummy_input.dtype in [mindspore.float32, torch.bfloat16]: - # dummy_input = dummy_input.to(mindspore.float16) - - # # make sure that all models have enough positions for generation - # if hasattr(config, "max_position_embeddings"): - # config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - # model = model_class(config) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # model.save_pretrained(tmpdirname) - - # # ensure left padding, to adapt for some models - # if 0 in inputs_dict["attention_mask"][:, -1]: - # inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1) - # dummy_attention_mask = inputs_dict["attention_mask"] - # inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.pad_token_id - - # model = ( - # model_class.from_pretrained( - # tmpdirname, - # ms_dtype=mindspore.float16, - # attn_implementation="flash_attention_2", - # low_cpu_mem_usage=True, - # ) - # .eval() - # ) - - # # flatten - # padfree_inputs_dict = { - # k: v[dummy_attention_mask.bool()].unsqueeze(0) - # for k, v in inputs_dict.items() - # if not k == "attention_mask" - # } - # # add position_ids - # padfree_inputs_dict["position_ids"] = ( - # ops.cat([torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()]) - # .long() - # .unsqueeze(0) - # ) - - # res_padded = model(**inputs_dict) - # res_padfree = model(**padfree_inputs_dict) - - # logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()] - # logits_padfree = res_padfree.logits[0] - - # torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), atol=0, rtol=0) - # # acceptable numerical instability - # tol = torch.finfo(mindspore.float16).eps - # torch.testing.assert_close(logits_padded, logits_padfree, atol=tol, rtol=tol) - - # @require_flash_attn - # @require_mindspore_gpu - # @mark.flash_attn_test - # @slow - # def test_flash_attn_2_from_config(self): - # if not self.has_attentions: - # self.skipTest(reason="Model architecture does not support attentions") - - # for model_class in self.all_generative_model_classes: - # if not model_class._supports_flash_attn_2: - # self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - # config, _ = self.model_tester.prepare_config_and_inputs_for_common() - # # TODO: to change it in the 
future with other relevant auto classes - # fa2_model = AutoModelForCausalLM.from_config( - # config, attn_implementation="flash_attention_2", ms_dtype=torch.bfloat16 - # ) - - # dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]) - # dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [0, 1, 1, 1]]) - - # fa2_correctly_converted = False - - # for _, module in fa2_model.named_modules(): - # if "FlashAttention" in module.__class__.__name__: - # fa2_correctly_converted = True - # break - - # self.assertTrue(fa2_correctly_converted) - - # _ = fa2_model(input_ids=dummy_input, attention_mask=dummy_attention_mask) - - # with tempfile.TemporaryDirectory() as tmpdirname: - # fa2_model.save_pretrained(tmpdirname) - - # model_from_pretrained = AutoModelForCausalLM.from_pretrained(tmpdirname) - - # self.assertTrue(model_from_pretrained.config._attn_implementation != "flash_attention_2") - - # fa2_correctly_converted = False - - # for _, module in model_from_pretrained.named_modules(): - # if "FlashAttention" in module.__class__.__name__: - # fa2_correctly_converted = True - # break - - # self.assertFalse(fa2_correctly_converted) - - def _get_custom_4d_mask_test_data(self): - # Sequence in which all but the last token is the same - input_ids = mindspore.tensor( - [[10, 11, 12, 13], [10, 11, 12, 14], [10, 11, 12, 15]], dtype=mindspore.int64 - ) - position_ids = mindspore.tensor([[0, 1, 2, 3]] * 3, dtype=mindspore.int64) - - # Combining common prefix with the unique ending tokens: - input_ids_shared_prefix = ops.cat([input_ids[0][:-1], input_ids[:, -1]]).unsqueeze(0) - - # Creating a 4D mask where each of the last 3 tokens do not attend to each other. - mask_shared_prefix = mindspore.tensor( - [ - [ - [ - [1, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0], - [1, 1, 1, 1, 0, 0], - [1, 1, 1, 0, 1, 0], - [1, 1, 1, 0, 0, 1], - ] - ] - ], - ) - # inverting the attention mask - mask_dtype = mindspore.float32 - min_dtype = float(ops.finfo(mask_dtype).min) - mask_shared_prefix = (mask_shared_prefix.eq(0.0)).to(dtype=mask_dtype) * min_dtype - - # Creating a position_ids tensor. note the repeating figures in the end. 
- position_ids_shared_prefix = mindspore.tensor([[0, 1, 2, 3, 3, 3]], dtype=mindspore.int64) - return input_ids, position_ids, input_ids_shared_prefix, mask_shared_prefix, position_ids_shared_prefix - - def test_custom_4d_attention_mask(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} is not guaranteed to work with custom 4D attention masks") - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - model = model_class(config).to(dtype=mindspore.float32) - - ( - input_ids, - position_ids, - input_ids_shared_prefix, - mask_shared_prefix, - position_ids_shared_prefix, - ) = self._get_custom_4d_mask_test_data() - - logits = model.forward(input_ids, position_ids=position_ids).logits - # logits.shape == torch.Size([3, 4, ...]) - - logits_shared_prefix = model( - input_ids_shared_prefix, - attention_mask=mask_shared_prefix, - position_ids=position_ids_shared_prefix, - )[0] - # logits_shared_prefix.shape == torch.Size([1, 6, ...]) - - out_last_tokens = logits[:, -1, :] # last tokens in each batch line - out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens - # comparing softmax-normalized logits: - normalized_0 = F.softmax(out_last_tokens) - normalized_1 = F.softmax(out_shared_prefix_last_tokens) - assert ops.allclose(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - - # # For now, Let's focus only on GPU for `torch.compile` - # @slow - # @require_mindspore_gpu - # @require_read_token - # def test_torch_compile(self): - # if version.parse(torch.__version__) < version.parse("2.3"): - # self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # if not hasattr(self, "_torch_compile_test_ckpt"): - # self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - # ckpt = self._torch_compile_test_ckpt - - # os.environ["TOKENIZERS_PARALLELISM"] = "false" - - # batch_size = 1 - # n_iter = 3 - - # tokenizer = AutoTokenizer.from_pretrained(ckpt) - # model = AutoModelForCausalLM.from_pretrained(ckpt, ms_dtype=mindspore.float16) - - # model.generation_config.max_new_tokens = 4 - - # model.generation_config.cache_implementation = "static" - # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - # input_text = "Why dogs are cute?" - # input_ids = tokenizer([input_text] * batch_size, return_tensors="ms") - - # for i in range(n_iter): - # _ = model.generate(**input_ids, do_sample=False) - - # @slow - # @require_mindspore_gpu # Testing cuda graphs. - # @require_read_token - # def test_compile_cuda_graph_time(self): - # if version.parse(torch.__version__) < version.parse("2.3"): - # self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # # At the moment, only llama, gemma and gemma2 are tested here! 
- # if not hasattr(self, "_torch_compile_test_ckpt"): - # self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - # ckpt = self._torch_compile_test_ckpt - - # os.environ["TOKENIZERS_PARALLELISM"] = "false" - - # tokenizer = AutoTokenizer.from_pretrained(ckpt) - # model = AutoModelForCausalLM.from_pretrained(ckpt, ms_dtype=mindspore.float16) - - # cache_implementation = "static" - # if model.config.model_type == "gemma2": - # cache_implementation = "hybrid" - - # new_tokens = 50 - # gen_config = GenerationConfig( - # max_new_tokens=new_tokens, - # min_new_tokens=new_tokens, - # use_cache=True, - # pad_token_id=tokenizer.pad_token_id, - # num_beams=1, - # do_sample=False, - # eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - # ) - # model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - - # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - # inp = tokenizer("Why cats are cute?", return_tensors="ms") - - # # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - # start = time.perf_counter() - # _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - # end = time.perf_counter() - # graph_warmup_time = end - start - - # # Second run: CUDA Graph recording, and replays it - # start = time.perf_counter() - # _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - # end = time.perf_counter() - # record_time = end - start - - # # Finally: we hit the optimized, CUDA Graph replay path - # start = time.perf_counter() - # _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - # end = time.perf_counter() - # opt_time = end - start - - # # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- # self.assertTrue(record_time < 0.15 * graph_warmup_time) - # self.assertTrue(opt_time < record_time) - - -global_rng = random.Random() - - -def ids_tensor(shape, vocab_size, rng=None, name=None): - # Creates a random int32 tensor of the shape within the vocab size - if rng is None: - rng = global_rng - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) - - return mindspore.tensor(values, dtype=mindspore.int64).view(tuple(shape)) - - -def random_attention_mask(shape, rng=None, name=None): - attn_mask = ids_tensor(shape, vocab_size=2, rng=None, name=None) - # make sure that at least one token is attended to for each batch - # we choose the 1st token so this property of `at least one being non-zero` still holds after applying causal mask - attn_mask[:, 0] = 1 - return attn_mask - - -def floats_tensor(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.random() * scale) - - return mindspore.tensor(values, dtype=mindspore.float32).view(tuple(shape)) \ No newline at end of file diff --git a/tests/transformers/test_pipeline_mixin.py b/tests/transformers/test_pipeline_mixin.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/transformers/test_sequence_feature_extraction_common.py b/tests/transformers/test_sequence_feature_extraction_common.py deleted file mode 100644 index d4d5d8af0..000000000 --- a/tests/transformers/test_sequence_feature_extraction_common.py +++ /dev/null @@ -1,394 +0,0 @@ -# coding=utf-8 -# Copyright 2021 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import numpy as np - -from mindnlp.transformers.feature_extraction_sequence_utils import BatchFeature -from mindnlp.utils.testing_utils import require_mindspore - -from .test_feature_extraction_common import FeatureExtractionSavingTestMixin - - -class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin): - # to overwrite at feature extractactor specific tests - feat_extract_tester = None - feature_extraction_class = None - - @property - def feat_extract_dict(self): - return self.feat_extract_tester.prepare_feat_extract_dict() - - def test_feat_extract_common_properties(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - self.assertTrue(hasattr(feat_extract, "feature_size")) - self.assertTrue(hasattr(feat_extract, "sampling_rate")) - self.assertTrue(hasattr(feat_extract, "padding_value")) - - def test_batch_feature(self): - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - input_name = feat_extract.model_input_names[0] - - processed_features = BatchFeature({input_name: speech_inputs}) - - self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name]))) - - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) - processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np") - - batch_features_input = processed_features[input_name] - - if len(batch_features_input.shape) < 3: - batch_features_input = batch_features_input[:, :, None] - - self.assertTrue( - batch_features_input.shape - == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) - ) - - @require_mindspore - def test_batch_feature_ms(self): - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - input_name = feat_extract.model_input_names[0] - - processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="ms") - - batch_features_input = processed_features[input_name] - - if len(batch_features_input.shape) < 3: - batch_features_input = batch_features_input[:, :, None] - - self.assertTrue( - batch_features_input.shape - == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) - ) - - - def _check_padding(self, numpify=False): - def _inputs_have_equal_length(input): - length = len(input[0]) - for input_slice in input[1:]: - if len(input_slice) != length: - return False - return True - - def _inputs_are_equal(input_1, input_2): - if len(input_1) != len(input_2): - return False - - for input_slice_1, input_slice_2 in zip(input_1, input_2): - if not np.allclose(np.asarray(input_slice_1), np.asarray(input_slice_2), atol=1e-3): - return False - return True - - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(numpify=numpify) - input_name = feat_extract.model_input_names[0] - - processed_features = BatchFeature({input_name: speech_inputs}) - - pad_diff = self.feat_extract_tester.seq_length_diff - pad_max_length = self.feat_extract_tester.max_seq_length + pad_diff - pad_min_length = self.feat_extract_tester.min_seq_length - batch_size = self.feat_extract_tester.batch_size - feature_size = self.feat_extract_tester.feature_size - - # test padding for List[int] + numpy - input_1 = feat_extract.pad(processed_features, 
padding=False) - input_1 = input_1[input_name] - - input_2 = feat_extract.pad(processed_features, padding="longest") - input_2 = input_2[input_name] - - input_3 = feat_extract.pad(processed_features, padding="max_length", max_length=len(speech_inputs[-1])) - input_3 = input_3[input_name] - - input_4 = feat_extract.pad(processed_features, padding="longest", return_tensors="np") - input_4 = input_4[input_name] - - # max_length parameter has to be provided when setting `padding="max_length"` - with self.assertRaises(ValueError): - feat_extract.pad(processed_features, padding="max_length")[input_name] - - input_5 = feat_extract.pad( - processed_features, padding="max_length", max_length=pad_max_length, return_tensors="np" - ) - input_5 = input_5[input_name] - - self.assertFalse(_inputs_have_equal_length(input_1)) - self.assertTrue(_inputs_have_equal_length(input_2)) - self.assertTrue(_inputs_have_equal_length(input_3)) - self.assertTrue(_inputs_are_equal(input_2, input_3)) - self.assertTrue(len(input_1[0]) == pad_min_length) - self.assertTrue(len(input_1[1]) == pad_min_length + pad_diff) - self.assertTrue(input_4.shape[:2] == (batch_size, len(input_3[0]))) - self.assertTrue(input_5.shape[:2] == (batch_size, pad_max_length)) - - if feature_size > 1: - self.assertTrue(input_4.shape[2] == input_5.shape[2] == feature_size) - - # test padding for `pad_to_multiple_of` for List[int] + numpy - input_6 = feat_extract.pad(processed_features, pad_to_multiple_of=10) - input_6 = input_6[input_name] - - input_7 = feat_extract.pad(processed_features, padding="longest", pad_to_multiple_of=10) - input_7 = input_7[input_name] - - input_8 = feat_extract.pad( - processed_features, padding="max_length", pad_to_multiple_of=10, max_length=pad_max_length - ) - input_8 = input_8[input_name] - - input_9 = feat_extract.pad( - processed_features, - padding="max_length", - pad_to_multiple_of=10, - max_length=pad_max_length, - return_tensors="np", - ) - input_9 = input_9[input_name] - - self.assertTrue(all(len(x) % 10 == 0 for x in input_6)) - self.assertTrue(_inputs_are_equal(input_6, input_7)) - - expected_mult_pad_length = pad_max_length if pad_max_length % 10 == 0 else (pad_max_length // 10 + 1) * 10 - self.assertTrue(all(len(x) == expected_mult_pad_length for x in input_8)) - self.assertEqual(input_9.shape[:2], (batch_size, expected_mult_pad_length)) - - if feature_size > 1: - self.assertTrue(input_9.shape[2] == feature_size) - - # Check padding value is correct - padding_vector_sum = (np.ones(self.feat_extract_tester.feature_size) * feat_extract.padding_value).sum() - self.assertTrue( - abs(np.asarray(input_2[0])[pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) - < 1e-3 - ) - self.assertTrue( - abs( - np.asarray(input_2[1])[pad_min_length + pad_diff :].sum() - - padding_vector_sum * (pad_max_length - pad_min_length - pad_diff) - ) - < 1e-3 - ) - self.assertTrue( - abs( - np.asarray(input_2[2])[pad_min_length + 2 * pad_diff :].sum() - - padding_vector_sum * (pad_max_length - pad_min_length - 2 * pad_diff) - ) - < 1e-3 - ) - self.assertTrue( - abs(input_5[0, pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) < 1e-3 - ) - self.assertTrue( - abs(input_9[0, pad_min_length:].sum() - padding_vector_sum * (expected_mult_pad_length - pad_min_length)) - < 1e-3 - ) - - def _check_truncation(self, numpify=False): - def _inputs_have_equal_length(input): - length = len(input[0]) - for input_slice in input[1:]: - if len(input_slice) != length: - return False - return 
True - - def _inputs_are_equal(input_1, input_2): - if len(input_1) != len(input_2): - return False - - for input_slice_1, input_slice_2 in zip(input_1, input_2): - if not np.allclose(np.asarray(input_slice_1), np.asarray(input_slice_2), atol=1e-3): - return False - return True - - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(numpify=numpify) - input_name = feat_extract.model_input_names[0] - - processed_features = BatchFeature({input_name: speech_inputs}) - - # truncate to smallest - input_1 = feat_extract.pad( - processed_features, padding="max_length", max_length=len(speech_inputs[0]), truncation=True - ) - input_1 = input_1[input_name] - - input_2 = feat_extract.pad(processed_features, padding="max_length", max_length=len(speech_inputs[0])) - input_2 = input_2[input_name] - - self.assertTrue(_inputs_have_equal_length(input_1)) - self.assertFalse(_inputs_have_equal_length(input_2)) - - # truncate to smallest with np - input_3 = feat_extract.pad( - processed_features, - padding="max_length", - max_length=len(speech_inputs[0]), - return_tensors="np", - truncation=True, - ) - input_3 = input_3[input_name] - - input_4 = feat_extract.pad( - processed_features, padding="max_length", max_length=len(speech_inputs[0]), return_tensors="np" - ) - input_4 = input_4[input_name] - - self.assertTrue(_inputs_have_equal_length(input_3)) - self.assertTrue(input_3.shape[1] == len(speech_inputs[0])) - - # since truncation forces padding to be smaller than longest input - # function can't return `np.ndarray`, but has to return list - self.assertFalse(_inputs_have_equal_length(input_4)) - - # truncate to middle - input_5 = feat_extract.pad( - processed_features, - padding="max_length", - max_length=len(speech_inputs[1]), - truncation=True, - return_tensors="np", - ) - input_5 = input_5[input_name] - - input_6 = feat_extract.pad( - processed_features, padding="max_length", max_length=len(speech_inputs[1]), truncation=True - ) - input_6 = input_6[input_name] - - input_7 = feat_extract.pad( - processed_features, padding="max_length", max_length=len(speech_inputs[1]), return_tensors="np" - ) - input_7 = input_7[input_name] - - self.assertTrue(input_5.shape[1] == len(speech_inputs[1])) - self.assertTrue(_inputs_have_equal_length(input_5)) - self.assertTrue(_inputs_have_equal_length(input_6)) - self.assertTrue(_inputs_are_equal(input_5, input_6)) - - # since truncation forces padding to be smaller than longest input - # function can't return `np.ndarray`, but has to return list - self.assertFalse(_inputs_have_equal_length(input_7)) - self.assertTrue(len(input_7[-1]) == len(speech_inputs[-1])) - - # padding has to be max_length when setting `truncation=True` - with self.assertRaises(ValueError): - feat_extract.pad(processed_features, truncation=True)[input_name] - - # padding has to be max_length when setting `truncation=True` - with self.assertRaises(ValueError): - feat_extract.pad(processed_features, padding="longest", truncation=True)[input_name] - - # padding has to be max_length when setting `truncation=True` - with self.assertRaises(ValueError): - feat_extract.pad(processed_features, padding="longest", truncation=True)[input_name] - - # max_length parameter has to be provided when setting `truncation=True` and padding="max_length" - with self.assertRaises(ValueError): - feat_extract.pad(processed_features, padding="max_length", truncation=True)[input_name] - - # test truncation for `pad_to_multiple_of` for List[int] 
+ numpy - pad_to_multiple_of = 12 - input_8 = feat_extract.pad( - processed_features, - padding="max_length", - max_length=len(speech_inputs[0]), - pad_to_multiple_of=pad_to_multiple_of, - truncation=True, - ) - input_8 = input_8[input_name] - - input_9 = feat_extract.pad( - processed_features, - padding="max_length", - max_length=len(speech_inputs[0]), - pad_to_multiple_of=pad_to_multiple_of, - ) - input_9 = input_9[input_name] - - # retrieve expected_length as multiple of pad_to_multiple_of - expected_length = len(speech_inputs[0]) - if expected_length % pad_to_multiple_of != 0: - expected_length = ((len(speech_inputs[0]) // pad_to_multiple_of) + 1) * pad_to_multiple_of - - self.assertTrue(len(input_8[0]) == expected_length) - self.assertTrue(_inputs_have_equal_length(input_8)) - self.assertFalse(_inputs_have_equal_length(input_9)) - - def test_padding_from_list(self): - self._check_padding(numpify=False) - - def test_padding_from_array(self): - self._check_padding(numpify=True) - - def test_truncation_from_list(self): - self._check_truncation(numpify=False) - - def test_truncation_from_array(self): - self._check_truncation(numpify=True) - - @require_mindspore - def test_padding_accepts_tensors_ms(self): - feat_extract = self.feature_extraction_class(**self.feat_extract_dict) - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() - input_name = feat_extract.model_input_names[0] - - processed_features = BatchFeature({input_name: speech_inputs}) - - input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] - input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="ms")[input_name] - - self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2) - - def test_attention_mask(self): - feat_dict = self.feat_extract_dict - feat_dict["return_attention_mask"] = True - feat_extract = self.feature_extraction_class(**feat_dict) - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() - input_lengths = [len(x) for x in speech_inputs] - input_name = feat_extract.model_input_names[0] - - processed = BatchFeature({input_name: speech_inputs}) - - processed = feat_extract.pad(processed, padding="longest", return_tensors="np") - self.assertIn("attention_mask", processed) - self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) - self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths) - - def test_attention_mask_with_truncation(self): - feat_dict = self.feat_extract_dict - feat_dict["return_attention_mask"] = True - feat_extract = self.feature_extraction_class(**feat_dict) - speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() - input_lengths = [len(x) for x in speech_inputs] - input_name = feat_extract.model_input_names[0] - - processed = BatchFeature({input_name: speech_inputs}) - max_length = min(input_lengths) - - processed_pad = feat_extract.pad( - processed, padding="max_length", max_length=max_length, truncation=True, return_tensors="np" - ) - self.assertIn("attention_mask", processed_pad) - self.assertListEqual( - list(processed_pad.attention_mask.shape), [processed_pad[input_name].shape[0], max_length] - ) - self.assertListEqual( - processed_pad.attention_mask[:, :max_length].sum(-1).tolist(), [max_length for x in speech_inputs] - ) diff --git a/tests/transformers/test_tokenization_common.py b/tests/transformers/test_tokenization_common.py deleted file mode 100644 index 
e57fe9b37..000000000 --- a/tests/transformers/test_tokenization_common.py +++ /dev/null @@ -1,3973 +0,0 @@ -# coding=utf-8 -# Copyright 2019 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import inspect -import itertools -import json -import os -import pickle -import re -import shutil -import tempfile -import traceback -import unittest -from collections import OrderedDict -from itertools import takewhile -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union - -from parameterized import parameterized - -from mindnlp.transformers import ( - AlbertTokenizer, - AlbertTokenizerFast, - BertTokenizer, - BertTokenizerFast, - PreTrainedTokenizer, - PreTrainedTokenizerBase, - PreTrainedTokenizerFast, - SpecialTokensMixin, - # Trainer, - # TrainingArguments, -) -from mindnlp.utils import logging, require_mindspore, is_mindspore_available -from mindnlp.transformers.tokenization_utils import AddedToken -from mindnlp.utils.testing_utils import get_tests_dir, check_json_file_has_correct_format, run_test_in_subprocess, slow - -if is_mindspore_available(): - from mindspore import nn - - -from mindnlp.transformers import PretrainedConfig, PreTrainedModel - - -logger = logging.get_logger(__name__) - -NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] - -SMALL_TRAINING_CORPUS = [ - ["This is the first sentence.", "This is the second one."], - ["This sentence (contains #) over symbols and numbers 12 3.", "But not this one."], -] - - -def filter_non_english(_, pretrained_name: str): - """Filter all the model for non-english language""" - return not any(lang in pretrained_name for lang in NON_ENGLISH_TAGS) - - -def filter_roberta_detectors(_, pretrained_name: str): - return "detector" not in pretrained_name - - -def merge_model_tokenizer_mappings( - model_mapping: Dict["PretrainedConfig", "PreTrainedModel"], - tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]], -) -> Dict[ - Union["PreTrainedTokenizer", "PreTrainedTokenizerFast"], - Tuple["PretrainedConfig", "PreTrainedModel"], -]: - configurations = list(model_mapping.keys()) - model_tokenizer_mapping = OrderedDict([]) - - for configuration in configurations: - if configuration in model_mapping and configuration in tokenizer_mapping: - model = model_mapping[configuration] - tokenizer = tokenizer_mapping[configuration][0] - tokenizer_fast = tokenizer_mapping[configuration][1] - - if tokenizer is not None: - if configuration.__name__.startswith(tokenizer.__name__.replace("Tokenizer", "")): - model_tokenizer_mapping.update({tokenizer: (configuration, model)}) - if tokenizer_fast is not None: - if configuration.__name__.startswith(tokenizer_fast.__name__.replace("TokenizerFast", "")): - model_tokenizer_mapping.update({tokenizer_fast: (configuration, model)}) - - return model_tokenizer_mapping - - -def _test_subword_regularization_tokenizer(in_queue, out_queue, timeout): - error = None - - try: - inputs = in_queue.get(timeout=timeout) - tokenizer = 
inputs["tokenizer"] - sp_model_kwargs = inputs["sp_model_kwargs"] - test_sentencepiece_ignore_case = inputs["test_sentencepiece_ignore_case"] - - unittest.TestCase().assertTrue(hasattr(tokenizer, "sp_model_kwargs")) - unittest.TestCase().assertIsNotNone(tokenizer.sp_model_kwargs) - unittest.TestCase().assertTrue(isinstance(tokenizer.sp_model_kwargs, dict)) - unittest.TestCase().assertDictEqual(tokenizer.sp_model_kwargs, sp_model_kwargs) - check_subword_sampling(tokenizer, test_sentencepiece_ignore_case=test_sentencepiece_ignore_case) - - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - -def check_subword_sampling( - tokenizer: PreTrainedTokenizer, - text: str = None, - test_sentencepiece_ignore_case: bool = True, -) -> None: - """ - Check if the tokenizer generates different results when subword regularization is enabled. - - Subword regularization augments training data with subword sampling. - This has a random component. - - Args: - tokenizer: The tokenizer to check. - text: The text to use for the checks. - test_sentencepiece_ignore_case: See `TokenizerTesterMixin.test_sentencepiece_ignore_case`. - """ - text = "This is a test for subword regularization." if text is None else text - if test_sentencepiece_ignore_case: - text = text.lower() - - tokens_list = [] - for _ in range(5): - tokens_list.append(tokenizer.tokenize(text)) - - # the list of different pairs of tokens_list - combinations = itertools.combinations(tokens_list, 2) - - # check of sampling is done - subword_sampling_found = False - for combination in combinations: - if combination[0] != combination[1]: - subword_sampling_found = True - unittest.TestCase().assertTrue(subword_sampling_found) - - # check if converting back to original text works - for tokens in tokens_list: - if test_sentencepiece_ignore_case: - unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower()) - else: - unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens)) - - -class TokenizerTesterMixin: - tokenizer_class = None - rust_tokenizer_class = None - test_slow_tokenizer = True - test_rust_tokenizer = True - space_between_special_tokens = False - from_pretrained_kwargs = None - from_pretrained_filter = None - from_pretrained_vocab_key = "vocab_file" - test_seq2seq = True - - # set to True to test a sentencepiece tokenizer - test_sentencepiece = False - - # set to True to ignore casing when testing a sentencepiece tokenizer - # test_sentencepiece must also be set to True - test_sentencepiece_ignore_case = False - - def setUp(self) -> None: - # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the - # information available in Tokenizer (name, rust class, python class, vocab key name) - if self.test_rust_tokenizer: - tokenizers_list = [ - ( - self.rust_tokenizer_class, - pretrained_name, - self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, - ) - for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[ - self.from_pretrained_vocab_key - ].keys() - if self.from_pretrained_filter is None - or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name)) - ] - self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed - else: - self.tokenizers_list = [] - with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: - self._data = 
f_data.read().replace("\n\n", "\n").strip() - - self.tmpdirname = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def get_input_output_texts(self, tokenizer): - input_txt = self.get_clean_sequence(tokenizer)[0] - return input_txt, input_txt - - def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: - # the length of the tokenizer does not always represent the tokens that it can encode: what if there are holes? - toks = [ - (i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in set(tokenizer.get_vocab().values()) - ] - toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) - toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) - if max_length is not None and len(toks) > max_length: - toks = toks[:max_length] - if min_length is not None and len(toks) < min_length and len(toks) > 0: - while len(toks) < min_length: - toks = toks + toks - # toks_str = [t[1] for t in toks] - toks_ids = [t[0] for t in toks] - - # Ensure consistency - output_txt = tokenizer.decode(toks_ids, clean_up_tokenization_spaces=False) - if " " not in output_txt and len(toks_ids) > 1: - output_txt = ( - tokenizer.decode([toks_ids[0]], clean_up_tokenization_spaces=False) - + " " - + tokenizer.decode(toks_ids[1:], clean_up_tokenization_spaces=False) - ) - if with_prefix_space: - output_txt = " " + output_txt - output_ids = tokenizer.encode(output_txt, add_special_tokens=False) - return output_txt, output_ids - - def get_tokenizers(self, fast=True, **kwargs) -> List[PreTrainedTokenizerBase]: - if fast and self.test_rust_tokenizer and self.test_slow_tokenizer: - return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] - elif fast and self.test_rust_tokenizer: - return [self.get_rust_tokenizer(**kwargs)] - elif self.test_slow_tokenizer: - return [self.get_tokenizer(**kwargs)] - else: - raise ValueError("This tokenizer class has no tokenizer to be tested.") - - def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) - - def tokenizer_integration_test_util( - self, - expected_encoding: Dict, - model_name: str, - revision: str = None, - sequences: List[str] = None, - decode_kwargs: Dict[str, Any] = None, - padding: bool = True, - ): - """ - Util for integration test. - - Text is tokenized and then reverted back to text. Both results are then checked. - - Args: - expected_encoding: - The expected result of the tokenizer output. - model_name: - The model name of the tokenizer to load and use. - revision: - The full git revision number of the model. This is to pin the - tokenizer config and to avoid that tests start to fail if the - config gets changed upstream. - sequences: - Can overwrite the texts that are used to check the tokenizer. - This is useful if the tokenizer supports non english languages - like france. - decode_kwargs: - Additional args for the ``decode`` function which reverts the - tokenized text back to a string. - padding: - Activates and controls padding of the tokenizer. 
- """ - decode_kwargs = {} if decode_kwargs is None else decode_kwargs - - if sequences is None: - sequences = [ - "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " - "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural " - "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained " - "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.", - "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " - "conditioning on both left and right context in all layers.", - "The quick brown fox jumps over the lazy dog.", - ] - - if self.test_sentencepiece_ignore_case: - sequences = [sequence.lower() for sequence in sequences] - - tokenizer_classes = [self.tokenizer_class] - if self.test_rust_tokenizer: - tokenizer_classes.append(self.rust_tokenizer_class) - - for tokenizer_class in tokenizer_classes: - tokenizer = tokenizer_class.from_pretrained( - model_name, - revision=revision, # to pin the tokenizer version - ) - - encoding = tokenizer(sequences, padding=padding) - decoded_sequences = [ - tokenizer.decode(seq, skip_special_tokens=True, **decode_kwargs) for seq in encoding["input_ids"] - ] - - encoding_data = encoding.data - self.assertDictEqual(encoding_data, expected_encoding) - - for expected, decoded in zip(sequences, decoded_sequences): - if self.test_sentencepiece_ignore_case: - expected = expected.lower() - self.assertEqual(expected, decoded) - - def assert_padded_input_match(self, input_r: list, input_p: list, max_length: int, pad_token_id: int): - # Ensure we match max_length - self.assertEqual(len(input_r), max_length) - self.assertEqual(len(input_p), max_length) - - # Ensure the number of padded tokens is the same - padded_tokens_r = list(takewhile(lambda i: i == pad_token_id, reversed(input_r))) - padded_tokens_p = list(takewhile(lambda i: i == pad_token_id, reversed(input_p))) - self.assertSequenceEqual(padded_tokens_r, padded_tokens_p) - - def assert_batch_padded_input_match( - self, - input_r: dict, - input_p: dict, - max_length: int, - pad_token_id: int, - model_main_input_name: str = "input_ids", - ): - for i_r in input_r.values(): - self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual( - len(i_r[1]), max_length - ) - self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual( - len(i_r[1]), max_length - ) - - for i_r, i_p in zip(input_r[model_main_input_name], input_p[model_main_input_name]): - self.assert_padded_input_match(i_r, i_p, max_length, pad_token_id) - - for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]): - self.assertSequenceEqual(i_r, i_p) - - @staticmethod - def convert_batch_encode_plus_format_to_encode_plus(batch_encode_plus_sequences): - # Switch from batch_encode_plus format: {'input_ids': [[...], [...]], ...} - # to the list of examples/ encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}] - return [ - {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()} - for i in range(len(batch_encode_plus_sequences["input_ids"])) - ] - - # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers. 
- def test_tokenize_special_tokens(self): - """Test `tokenize` with special tokens.""" - tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]" - SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]" - - # Both methods should add the token to `_additional_special_tokens` and `added_tokens_decoder` - tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True) - tokenizer.add_special_tokens( - {"additional_special_tokens": [SPECIAL_TOKEN_2]}, replace_additional_special_tokens=False - ) - - token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1) - token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2) - - self.assertEqual(len(token_1), 1) - self.assertEqual(len(token_2), 1) - self.assertEqual(token_1[0], SPECIAL_TOKEN_1) - # next is failing for almost all the Fast tokenizers now. - # self.assertEqual(token_2[0], SPECIAL_TOKEN_2) - - # TODO: this test could be extended to all tokenizers - not just the sentencepiece - def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): - """Test ``_tokenize`` and ``convert_tokens_to_string``.""" - if not self.test_sentencepiece: - return - - tokenizer = self.get_tokenizer() - text = "This is text to test the tokenizer." - - if self.test_sentencepiece_ignore_case: - text = text.lower() - - tokens = tokenizer.tokenize(text) - - self.assertTrue(len(tokens) > 0) - - # check if converting back to original text works - reverse_text = tokenizer.convert_tokens_to_string(tokens) - - if self.test_sentencepiece_ignore_case: - reverse_text = reverse_text.lower() - - self.assertEqual(reverse_text, text) - - special_tokens = tokenizer.all_special_tokens - special_tokens_string = tokenizer.convert_tokens_to_string(special_tokens) - for special_token in special_tokens: - self.assertIn(special_token, special_tokens_string) - - if self.test_rust_tokenizer: - rust_tokenizer = self.get_rust_tokenizer() - special_tokens_string_rust = rust_tokenizer.convert_tokens_to_string(special_tokens) - self.assertEqual(special_tokens_string, special_tokens_string_rust) - - def test_sentencepiece_tokenize_and_decode(self): - if not self.test_sentencepiece: - return - - text = "This is text to test the tokenizer." - if self.test_rust_tokenizer: - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - slow_ids = tokenizer(text).input_ids - fast_ids = rust_tokenizer(text).input_ids - self.assertEqual(slow_ids, fast_ids) - - slow_decoded = tokenizer.decode(slow_ids) - fast_decoded = rust_tokenizer.decode(slow_ids) - self.assertEqual(slow_decoded, fast_decoded) - - def test_subword_regularization_tokenizer(self) -> None: - if not self.test_sentencepiece: - return - - # Subword regularization is only available for the slow tokenizer. - sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} - tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs) - - run_test_in_subprocess( - test_case=self, - target_func=_test_subword_regularization_tokenizer, - inputs={ - "tokenizer": tokenizer, - "sp_model_kwargs": sp_model_kwargs, - "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case, - }, - ) - - def test_pickle_subword_regularization_tokenizer(self) -> None: - if not self.test_sentencepiece: - return - - """Google pickle __getstate__ __setstate__ if you are struggling with this.""" - # Subword regularization is only available for the slow tokenizer. 
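
[Editor's note] For context, a minimal sketch of what the subprocess target `_test_subword_regularization_tokenizer` ultimately verifies through `check_subword_sampling`: with sampling enabled in `sp_model_kwargs`, a sentencepiece-backed slow tokenizer may segment the same text differently on repeated calls, yet every segmentation decodes back to the original text. The checkpoint name is an assumption for illustration only; the mixin itself builds its tokenizer from a local fixture via `get_tokenizer`.

import itertools
from mindnlp.transformers import AlbertTokenizer

# Assumed public checkpoint; any sentencepiece slow tokenizer would do.
tok = AlbertTokenizer.from_pretrained(
    "albert-base-v2",
    sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1},
)

text = "this is a test for subword regularization."
tokens_list = [tok.tokenize(text) for _ in range(5)]

# With sampling on, at least one pair of runs is expected to differ...
assert any(a != b for a, b in itertools.combinations(tokens_list, 2))
# ...while every sampled segmentation still decodes back to the (lower-cased) input.
assert all(tok.convert_tokens_to_string(tokens).lower() == text for tokens in tokens_list)
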
- sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} - tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs) - tokenizer_bin = pickle.dumps(tokenizer) - del tokenizer - tokenizer_new = pickle.loads(tokenizer_bin) - - run_test_in_subprocess( - test_case=self, - target_func=_test_subword_regularization_tokenizer, - inputs={ - "tokenizer": tokenizer_new, - "sp_model_kwargs": sp_model_kwargs, - "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case, - }, - ) - - def test_save_sentencepiece_tokenizer(self) -> None: - if not self.test_sentencepiece or not self.test_slow_tokenizer: - return - # We want to verify that we will be able to save the tokenizer even if the original files that were used to - # build the tokenizer have been deleted in the meantime. - text = "This is text to test the tokenizer." - - tokenizer_slow_1 = self.get_tokenizer() - encoding_tokenizer_slow_1 = tokenizer_slow_1(text) - - tmpdirname_1 = tempfile.mkdtemp() - tmpdirname_2 = tempfile.mkdtemp() - - tokenizer_slow_1.save_pretrained(tmpdirname_1) - tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1) - encoding_tokenizer_slow_2 = tokenizer_slow_2(text) - - shutil.rmtree(tmpdirname_1) - tokenizer_slow_2.save_pretrained(tmpdirname_2) - - tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2) - encoding_tokenizer_slow_3 = tokenizer_slow_3(text) - shutil.rmtree(tmpdirname_2) - - self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2) - self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3) - - def test_model_input_names_signature(self): - accepted_model_main_input_names = [ - "input_ids", # nlp models - "input_values", # speech models - ] - - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - # first name of model_input_names has to correspond to main model input name - # to make sure `tokenizer.pad(...)` works correctly - self.assertTrue(tokenizer.model_input_names[0] in accepted_model_main_input_names) - - def test_rust_tokenizer_signature(self): - if not self.test_rust_tokenizer: - return - - signature = inspect.signature(self.rust_tokenizer_class.__init__) - - self.assertIn("tokenizer_file", signature.parameters) - self.assertIsNone(signature.parameters["tokenizer_file"].default) - - def test_tokenizer_slow_store_full_signature(self): - if not self.test_slow_tokenizer: - return - - signature = inspect.signature(self.tokenizer_class.__init__) - tokenizer = self.get_tokenizer() - - for parameter_name, parameter in signature.parameters.items(): - if parameter.default != inspect.Parameter.empty: - self.assertIn(parameter_name, tokenizer.init_kwargs) - - def test_tokenizer_fast_store_full_signature(self): - if not self.test_rust_tokenizer: - return - - signature = inspect.signature(self.rust_tokenizer_class.__init__) - tokenizer = self.get_rust_tokenizer() - - for parameter_name, parameter in signature.parameters.items(): - if parameter.default != inspect.Parameter.empty and parameter_name not in [ - "vocab_file", - "merges_file", - "tokenizer_file", - ]: - self.assertIn(parameter_name, tokenizer.init_kwargs) - - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return - - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer() - - sequence, _ = self.get_input_output_texts(tokenizer) - - # We don't have 
an exact equivalence on `tokenize()` between Rust and Slow - # Slow tokenizer only split tokens, Rust tokenizers will replace with - # tokens = tokenizer.tokenize(sequence) - # rust_tokens = rust_tokenizer.tokenize(sequence) - # self.assertListEqual(tokens, rust_tokens) - - ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - ids = tokenizer.encode(sequence, add_special_tokens=True) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True) - self.assertListEqual(ids, rust_ids) - - def test_tokenizers_common_properties(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - attributes_list = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - ] - for attr in attributes_list: - self.assertTrue(hasattr(tokenizer, attr)) - self.assertTrue(hasattr(tokenizer, attr + "_id")) - - self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) - self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids")) - - attributes_list = [ - "model_max_length", - "init_inputs", - "init_kwargs", - ] - if not isinstance(tokenizer, PreTrainedTokenizerFast): - attributes_list += [ - "added_tokens_encoder", - "added_tokens_decoder", - ] - for attr in attributes_list: - self.assertTrue(hasattr(tokenizer, attr)) - - def test_tokenizers_common_ids_setters(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - attributes_list = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - ] - - vocab = tokenizer.get_vocab() - token_id_to_test_setters = next(iter(vocab.values())) - token_to_test_setters = tokenizer.convert_ids_to_tokens( - token_id_to_test_setters, skip_special_tokens=False - ) - - for attr in attributes_list: - setattr(tokenizer, attr + "_id", None) - self.assertEqual(getattr(tokenizer, attr), None) - self.assertEqual(getattr(tokenizer, attr + "_id"), None) - - setattr(tokenizer, attr + "_id", token_id_to_test_setters) - self.assertEqual(getattr(tokenizer, attr), token_to_test_setters) - self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters) - - setattr(tokenizer, "additional_special_tokens_ids", []) - self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), []) - self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), []) - - setattr(tokenizer, "additional_special_tokens_ids", [token_id_to_test_setters]) - self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters]) - self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters]) - - @parameterized.expand([(True,), (False,)]) - def test_tokenizers_special_tokens_properties_unset(self, verbose): - tokenizers = self.get_tokenizers(verbose=verbose) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - attributes_list = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens", - ] - for attr in attributes_list: - setattr(tokenizer, attr, None) - self.assertIsNone(getattr(tokenizer, attr)) - - def test_save_and_load_tokenizer(self): - # safety check on max_len default value so we are sure the test works - tokenizers = 
self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertNotEqual(tokenizer.model_max_length, 42) - - # Now let's start the test - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00E9d,running" - before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) - after_vocab = after_tokenizer.get_vocab() - self.assertListEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - - shutil.rmtree(tmpdirname) - - tokenizers = self.get_tokenizers(model_max_length=42) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00E9d,running" - tokenizer.add_tokens(["bim", "bambam"]) - additional_special_tokens = tokenizer.additional_special_tokens - additional_special_tokens.append("new_additional_special_token") - tokenizer.add_special_tokens( - {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False - ) - before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) - after_vocab = after_tokenizer.get_vocab() - self.assertListEqual(before_tokens, after_tokens) - - self.assertDictEqual(before_vocab, after_vocab) - self.assertIn("bim", after_vocab) - self.assertIn("bambam", after_vocab) - self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) - self.assertEqual(after_tokenizer.model_max_length, 42) - - tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) - self.assertEqual(tokenizer.model_max_length, 43) - - shutil.rmtree(tmpdirname) - - # Test that we can also use the non-legacy saving format for fast tokenizers - tokenizers = self.get_tokenizers(model_max_length=42) - for tokenizer in tokenizers: - if not tokenizer.is_fast: - continue - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Isolate this from the other tests because we save additional tokens/etc - tmpdirname = tempfile.mkdtemp() - - sample_text = " He is very happy, UNwant\u00E9d,running" - tokenizer.add_tokens(["bim", "bambam"]) - additional_special_tokens = tokenizer.additional_special_tokens - additional_special_tokens.append("new_additional_special_token") - tokenizer.add_special_tokens( - {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False - ) - before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) - before_vocab = tokenizer.get_vocab() - tokenizer.save_pretrained(tmpdirname) - - after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) - after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) - after_vocab = after_tokenizer.get_vocab() - 
self.assertListEqual(before_tokens, after_tokens) - self.assertDictEqual(before_vocab, after_vocab) - self.assertIn("bim", after_vocab) - self.assertIn("bambam", after_vocab) - self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) - self.assertEqual(after_tokenizer.model_max_length, 42) - - tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) - self.assertEqual(tokenizer.model_max_length, 43) - - shutil.rmtree(tmpdirname) - - def test_pickle_tokenizer(self): - """Google pickle __getstate__ __setstate__ if you are struggling with this.""" - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertIsNotNone(tokenizer) - - text = "Munich and Berlin are nice cities" - subwords = tokenizer.tokenize(text) - - filename = os.path.join(self.tmpdirname, "tokenizer.bin") - with open(filename, "wb") as handle: - pickle.dump(tokenizer, handle) - - with open(filename, "rb") as handle: - tokenizer_new = pickle.load(handle) - - subwords_loaded = tokenizer_new.tokenize(text) - - self.assertListEqual(subwords, subwords_loaded) - - def test_pickle_added_tokens(self): - tok1 = AddedToken("", rstrip=True, lstrip=True, normalized=False, single_word=True) - tok2 = pickle.loads(pickle.dumps(tok1)) - - self.assertEqual(tok1.__getstate__(), tok2.__getstate__()) - - def test_added_tokens_do_lower_case(self): - tokenizers = self.get_tokenizers(do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case: - continue - - special_token = tokenizer.all_special_tokens[0] - - text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token - text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token - - toks_before_adding = tokenizer.tokenize(text) # toks before adding new_toks - - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] - added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks]) - - toks_after_adding = tokenizer.tokenize(text) - toks_after_adding2 = tokenizer.tokenize(text2) - - # Rust tokenizers dont't lowercase added tokens at the time calling `tokenizer.add_tokens`, - # while python tokenizers do, so new_toks 0 and 2 would be treated as the same, so do new_toks 1 and 3. - self.assertIn(added, [2, 4]) - - self.assertListEqual(toks_after_adding, toks_after_adding2) - self.assertTrue( - len(toks_before_adding) > len(toks_after_adding), # toks_before_adding should be longer - ) - - # Check that none of the special tokens are lowercased - sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B" - # Convert the tokenized list to str as some special tokens are tokenized like normal tokens - # which have a prefix spacee e.g. the mask token of Albert, and cannot match the original - # special tokens exactly. 
- tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens)) - - for special_token in tokenizer.all_special_tokens: - self.assertTrue(special_token in tokenized_sequence or special_token.lower() in tokenized_sequence) - - tokenizers = self.get_tokenizers(do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case: - continue - - special_token = tokenizer.all_special_tokens[0] - - text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token - text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token - - toks_before_adding = tokenizer.tokenize(text) # toks before adding new_toks - - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] - added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks]) - self.assertIn(added, [2, 4]) - - toks_after_adding = tokenizer.tokenize(text) - toks_after_adding2 = tokenizer.tokenize(text2) - - self.assertEqual(len(toks_after_adding), len(toks_after_adding2)) # Length should still be the same - self.assertNotEqual( - toks_after_adding[1], toks_after_adding2[1] - ) # But at least the first non-special tokens should differ - self.assertTrue( - len(toks_before_adding) > len(toks_after_adding), # toks_before_adding should be longer - ) - - # TODO @ArthurZ Nuke this - def test_add_tokens_tokenizer(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) - - self.assertNotEqual(vocab_size, 0) - - # We usually have added tokens from the start in tests (but also otherwise) because our vocab fixtures are - # smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) - - new_toks = [ - AddedToken("aaaaa bbbbbb", rstrip=True, lstrip=True), - AddedToken("cccccccccdddddddd", rstrip=True, lstrip=True), - ] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = { - "eos_token": AddedToken(">>>>|||<||<<|<<", rstrip=True, lstrip=True), - "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=True, lstrip=True), - } - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.encode( - ">>>>|||<||<<|<< aaaaa bbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - 
self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.pad_token_id) - - def test_add_special_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - input_text, ids = self.get_clean_sequence(tokenizer) - - special_token = AddedToken("[SPECIAL_TOKEN]", lstrip=True, rstrip=True) - - tokenizer.add_special_tokens({"cls_token": special_token}) - special_token = str(special_token) - encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False) - self.assertEqual(len(encoded_special_token), 1) - - text = tokenizer.decode(ids + encoded_special_token, clean_up_tokenization_spaces=False) - encoded = tokenizer.encode(text, add_special_tokens=False) - - input_encoded = tokenizer.encode(input_text, add_special_tokens=False) - special_token_id = tokenizer.encode(special_token, add_special_tokens=False) - self.assertEqual(encoded, input_encoded + special_token_id) - - decoded = tokenizer.decode(encoded, skip_special_tokens=True) - self.assertTrue(special_token not in decoded) - - def test_internal_consistency(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - input_text, output_text = self.get_input_output_texts(tokenizer) - - tokens = tokenizer.tokenize(input_text) - ids = tokenizer.convert_tokens_to_ids(tokens) - ids_2 = tokenizer.encode(input_text, add_special_tokens=False) - self.assertListEqual(ids, ids_2) - - tokens_2 = tokenizer.convert_ids_to_tokens(ids) - self.assertNotEqual(len(tokens_2), 0) - text_2 = tokenizer.decode(ids) - self.assertIsInstance(text_2, str) - - self.assertEqual(text_2, output_text) - - def test_encode_decode_with_spaces(self): - tokenizers = self.get_tokenizers(do_lower_case=False, fast=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - new_toks = [ - # These are added tokens, they will be normalized.... 
- AddedToken("[ABC]", normalized=True, lstrip=True, rstrip=True), - AddedToken("[DEF]", normalized=True, lstrip=True, rstrip=True), - AddedToken("GHI IHG", normalized=True, lstrip=True, rstrip=True), - ] - tokenizer.add_tokens(new_toks) - tokenizer.add_tokens([AddedToken("[SAMPLE]", normalized=True)], special_tokens=True) - input = "[ABC][DEF][ABC]GHI IHG[DEF]" - if self.space_between_special_tokens: - output = "[ABC] [DEF] [ABC] GHI IHG [DEF]" - else: - output = input - encoded = tokenizer.encode(input, add_special_tokens=False) - decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) - - self.assertIn(decoded, [output, output.lower()]) - return - - def test_pretrained_model_lists(self): - # We should have at least one default checkpoint for each tokenizer - # We should specify the max input length as well (used in some part to list the pretrained checkpoints) - self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) - self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) - self.assertEqual( - len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), - len(self.tokenizer_class.max_model_input_sizes), - ) - - weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) - weights_lists_2 = [] - for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items(): - weights_lists_2.append(list(map_list.keys())) - - for weights_list_2 in weights_lists_2: - self.assertListEqual(weights_list, weights_list_2) - - def test_mask_output(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if ( - tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" - and "token_type_ids" in tokenizer.model_input_names - ): - seq_0 = "Test this method." - seq_1 = "With these inputs." - information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) - sequences, mask = information["input_ids"], information["token_type_ids"] - self.assertEqual(len(sequences), len(mask)) - - def test_token_type_ids(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0 = "Test this method." - - # We want to have sequence 0 and sequence 1 are tagged - # respectively with 0 and 1 token_ids - # (regardless of whether the model use token type ids) - # We use this assumption in the QA pipeline among other place - output = tokenizer(seq_0, return_token_type_ids=True) - self.assertIn(0, output["token_type_ids"]) - - def test_sequence_ids(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - if not tokenizer.is_fast: - continue - with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0 = "Test this method." - seq_1 = "With these inputs." 
- - # We want to have sequence 0 and sequence 1 are tagged - # respectively with 0 and 1 token_ids - # (regardless of whether the model use token type ids) - # We use this assumption in the QA pipeline among other place - output = tokenizer(seq_0) - self.assertIn(0, output.sequence_ids()) - - output = tokenizer(seq_0, seq_1) - self.assertIn(0, output.sequence_ids()) - self.assertIn(1, output.sequence_ids()) - - if tokenizer.num_special_tokens_to_add(pair=True): - self.assertIn(None, output.sequence_ids()) - - def test_chat_template(self): - dummy_template = "{% for message in messages %}{{message['role'] + message['content']}}{% endfor %}" - dummy_conversation = [ - {"role": "system", "content": "system message"}, - {"role": "user", "content": "user message"}, - {"role": "assistant", "content": "assistant message"}, - ] - expected_output = "systemsystem messageuseruser messageassistantassistant message" - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - output = tokenizer.apply_chat_template( - dummy_conversation, chat_template=dummy_template, tokenize=False - ) - self.assertEqual(output, expected_output) # Test we can pass chat_template arg - # Check that no error raised when tokenize=True - tokenizer.apply_chat_template(dummy_conversation, chat_template=dummy_template, tokenize=True) - - tokenizer.chat_template = dummy_template - self.assertEqual(tokenizer.chat_template, dummy_template) # Test property setter - output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False) - self.assertEqual(output, expected_output) # Test chat_template attribute is used if no arg is passed - tokenizer.apply_chat_template(dummy_conversation, tokenize=True) # Check that no error raised - - with tempfile.TemporaryDirectory() as tmp_dir_name: - tokenizer.save_pretrained(tmp_dir_name) - tokenizer = tokenizer.from_pretrained(tmp_dir_name) - - self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted - output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False) - self.assertEqual(output, expected_output) # Test output is the same after reloading - tokenizer.apply_chat_template(dummy_conversation, tokenize=True) # Check that no error raised - - def test_number_of_added_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0 = "Test this method." - seq_1 = "With these inputs." - - sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) - attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) - - # Method is implemented (e.g. 
not GPT-2) - if len(attached_sequences) != 2: - self.assertEqual( - tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) - ) - - def test_maximum_encoding_length_single_input(self): - tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20) - - sequence = tokenizer.encode(seq_0, add_special_tokens=False) - total_length = len(sequence) - - self.assertGreater( - total_length, 4, "Issue with the testing sequence, please update it, it's too short" - ) - - # Test with max model input length - model_max_length = tokenizer.model_max_length - self.assertEqual(model_max_length, 100) - seq_1 = seq_0 * model_max_length - - sequence1 = tokenizer(seq_1, add_special_tokens=False) - total_length1 = len(sequence1["input_ids"]) - self.assertGreater( - total_length1, - model_max_length, - "Issue with the testing sequence, please update it, it's too short", - ) - - # Simple - padding_strategies = ( - [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] - ) - for padding_state in padding_strategies: - with self.subTest(f"Padding: {padding_state}"): - for truncation_state in [True, "longest_first", "only_first"]: - with self.subTest(f"Truncation: {truncation_state}"): - output = tokenizer(seq_1, padding=padding_state, truncation=truncation_state) - self.assertEqual(len(output["input_ids"]), model_max_length) - - output = tokenizer([seq_1], padding=padding_state, truncation=truncation_state) - self.assertEqual(len(output["input_ids"][0]), model_max_length) - - # Simple with no truncation - # Reset warnings - tokenizer.deprecation_warnings = {} - with self.assertLogs("mindnlp.transformers", level="WARNING") as cm: - output = tokenizer(seq_1, padding=padding_state, truncation=False) - self.assertNotEqual(len(output["input_ids"]), model_max_length) - self.assertEqual(len(cm.records), 1) - self.assertTrue( - cm.records[0].message.startswith( - "Token indices sequence length is longer than the specified maximum sequence length" - " for this model" - ) - ) - - tokenizer.deprecation_warnings = {} - with self.assertLogs("mindnlp.transformers", level="WARNING") as cm: - output = tokenizer([seq_1], padding=padding_state, truncation=False) - self.assertNotEqual(len(output["input_ids"][0]), model_max_length) - self.assertEqual(len(cm.records), 1) - self.assertTrue( - cm.records[0].message.startswith( - "Token indices sequence length is longer than the specified maximum sequence length" - " for this model" - ) - ) - - # Overflowing tokens - stride = 2 - information = tokenizer( - seq_0, - max_length=total_length - 2, - add_special_tokens=False, - stride=stride, - truncation="longest_first", - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - - # Overflowing tokens are handled quite differently in slow and fast tokenizers - if isinstance(tokenizer, PreTrainedTokenizerFast): - truncated_sequence = information["input_ids"][0] - overflowing_tokens = information["input_ids"][1] - self.assertEqual(len(information["input_ids"]), 2) - - self.assertEqual(len(truncated_sequence), total_length - 2) - self.assertEqual(truncated_sequence, sequence[:-2]) - - self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) - else: - truncated_sequence = information["input_ids"] - overflowing_tokens = 
information["overflowing_tokens"] - - self.assertEqual(len(truncated_sequence), total_length - 2) - self.assertEqual(truncated_sequence, sequence[:-2]) - - self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) - - def test_maximum_encoding_length_pair_input(self): - tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Build a sequence from our model's vocabulary - stride = 2 - seq_0, ids = self.get_clean_sequence(tokenizer, max_length=20) - if len(ids) <= 2 + stride: - seq_0 = (seq_0 + " ") * (2 + stride) - ids = None - - seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False) - self.assertGreater(len(seq0_tokens), 2 + stride) - - seq_1 = "This is another sentence to be encoded." - seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) - if abs(len(seq0_tokens) - len(seq1_tokens)) <= 2: - seq1_tokens = seq1_tokens + seq1_tokens - seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False) - seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) - - self.assertGreater(len(seq1_tokens), 2 + stride) - - smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens - - # We are not using the special tokens - a bit too hard to test all the tokenizers with this - # TODO try this again later - sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) # , add_prefix_space=False) - - # Test with max model input length - model_max_length = tokenizer.model_max_length - self.assertEqual(model_max_length, 100) - seq_2 = seq_0 * model_max_length - self.assertGreater(len(seq_2), model_max_length) - - sequence1 = tokenizer(seq_1, add_special_tokens=False) - total_length1 = len(sequence1["input_ids"]) - sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False) - total_length2 = len(sequence2["input_ids"]) - self.assertLess( - total_length1, model_max_length - 10, "Issue with the testing sequence, please update it." - ) - self.assertGreater( - total_length2, model_max_length, "Issue with the testing sequence, please update it." 
- ) - - # Simple - padding_strategies = ( - [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] - ) - for padding_state in padding_strategies: - with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"): - for truncation_state in [True, "longest_first", "only_first"]: - with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"): - output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state) - self.assertEqual(len(output["input_ids"]), model_max_length) - - output = tokenizer( - [seq_2], [seq_1], padding=padding_state, truncation=truncation_state - ) - self.assertEqual(len(output["input_ids"][0]), model_max_length) - - # Simple - output = tokenizer(seq_1, seq_2, padding=padding_state, truncation="only_second") - self.assertEqual(len(output["input_ids"]), model_max_length) - - output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation="only_second") - self.assertEqual(len(output["input_ids"][0]), model_max_length) - - # Simple with no truncation - # Reset warnings - tokenizer.deprecation_warnings = {} - with self.assertLogs("mindnlp.transformers", level="WARNING") as cm: - output = tokenizer(seq_1, seq_2, padding=padding_state, truncation=False) - self.assertNotEqual(len(output["input_ids"]), model_max_length) - self.assertEqual(len(cm.records), 1) - self.assertTrue( - cm.records[0].message.startswith( - "Token indices sequence length is longer than the specified maximum sequence length" - " for this model" - ) - ) - - tokenizer.deprecation_warnings = {} - with self.assertLogs("mindnlp.transformers", level="WARNING") as cm: - output = tokenizer([seq_1], [seq_2], padding=padding_state, truncation=False) - self.assertNotEqual(len(output["input_ids"][0]), model_max_length) - self.assertEqual(len(cm.records), 1) - self.assertTrue( - cm.records[0].message.startswith( - "Token indices sequence length is longer than the specified maximum sequence length" - " for this model" - ) - ) - - truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode( - seq_1, add_special_tokens=False - ) - truncated_second_sequence = ( - tokenizer.encode(seq_0, add_special_tokens=False) - + tokenizer.encode(seq_1, add_special_tokens=False)[:-2] - ) - truncated_longest_sequence = ( - truncated_first_sequence if len(seq0_tokens) > len(seq1_tokens) else truncated_second_sequence - ) - - overflow_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[ - -(2 + stride) : - ] + tokenizer.encode(seq_1, add_special_tokens=False) - overflow_second_sequence = ( - tokenizer.encode(seq_0, add_special_tokens=False) - + tokenizer.encode(seq_1, add_special_tokens=False)[-(2 + stride) :] - ) - overflow_longest_sequence = ( - overflow_first_sequence if len(seq0_tokens) > len(seq1_tokens) else overflow_second_sequence - ) - - # Overflowing tokens are handled quite differently in slow and fast tokenizers - if isinstance(tokenizer, PreTrainedTokenizerFast): - information = tokenizer( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation="longest_first", - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - truncated_sequence = information["input_ids"][0] - overflowing_tokens = information["input_ids"][1] - self.assertEqual(len(information["input_ids"]), 2) - - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_longest_sequence) - - 
self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) - self.assertEqual(overflowing_tokens, overflow_longest_sequence) - else: - # No overflowing tokens when using 'longest' in python tokenizers - with self.assertRaises(ValueError) as context: - information = tokenizer( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation="longest_first", - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - - self.assertTrue( - context.exception.args[0].startswith( - "Not possible to return overflowing tokens for pair of sequences with the " - "`longest_first`. Please select another truncation strategy than `longest_first`, " - "for instance `only_second` or `only_first`." - ) - ) - - # Overflowing tokens are handled quite differently in slow and fast tokenizers - if isinstance(tokenizer, PreTrainedTokenizerFast): - information = tokenizer( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation=True, - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - truncated_sequence = information["input_ids"][0] - overflowing_tokens = information["input_ids"][1] - self.assertEqual(len(information["input_ids"]), 2) - - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_longest_sequence) - - self.assertEqual(len(overflowing_tokens), 2 + stride + len(smallest)) - self.assertEqual(overflowing_tokens, overflow_longest_sequence) - else: - # No overflowing tokens when using 'longest' in python tokenizers - with self.assertRaises(ValueError) as context: - information = tokenizer( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation=True, - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - - self.assertTrue( - context.exception.args[0].startswith( - "Not possible to return overflowing tokens for pair of sequences with the " - "`longest_first`. Please select another truncation strategy than `longest_first`, " - "for instance `only_second` or `only_first`." 
- ) - ) - - information_first_truncated = tokenizer( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation="only_first", - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - # Overflowing tokens are handled quite differently in slow and fast tokenizers - if isinstance(tokenizer, PreTrainedTokenizerFast): - truncated_sequence = information_first_truncated["input_ids"][0] - overflowing_tokens = information_first_truncated["input_ids"][1] - self.assertEqual(len(information_first_truncated["input_ids"]), 2) - - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_first_sequence) - - self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq1_tokens)) - self.assertEqual(overflowing_tokens, overflow_first_sequence) - else: - truncated_sequence = information_first_truncated["input_ids"] - overflowing_tokens = information_first_truncated["overflowing_tokens"] - - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_first_sequence) - - self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, seq0_tokens[-(2 + stride) :]) - - information_second_truncated = tokenizer( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=False, - stride=stride, - truncation="only_second", - return_overflowing_tokens=True, - # add_prefix_space=False, - ) - # Overflowing tokens are handled quite differently in slow and fast tokenizers - if isinstance(tokenizer, PreTrainedTokenizerFast): - truncated_sequence = information_second_truncated["input_ids"][0] - overflowing_tokens = information_second_truncated["input_ids"][1] - self.assertEqual(len(information_second_truncated["input_ids"]), 2) - - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_second_sequence) - - self.assertEqual(len(overflowing_tokens), 2 + stride + len(seq0_tokens)) - self.assertEqual(overflowing_tokens, overflow_second_sequence) - else: - truncated_sequence = information_second_truncated["input_ids"] - overflowing_tokens = information_second_truncated["overflowing_tokens"] - - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_second_sequence) - - self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :]) - - # def test_encode_input_type(self): - # tokenizers = self.get_tokenizers(do_lower_case=False) - # for tokenizer in tokenizers: - # with self.subTest(f"{tokenizer.__class__.__name__}"): - # sequence = "Let's encode this sequence" - - # tokens = sequence.split() # tokenizer.tokenize(sequence) - # # input_ids = tokenizer.convert_tokens_to_ids(tokens) - # formatted_input = tokenizer.encode(sequence, add_special_tokens=True, add_prefix_space=False) - - # self.assertEqual( - # tokenizer.encode(tokens, is_split_into_words=True, add_special_tokens=True), formatted_input - # ) - # # This is not supported with the Rust tokenizers - # # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input) - - # def test_swap_special_token(self): - # tokenizers = self.get_tokenizers(do_lower_case=False) - # for tokenizer in tokenizers: - # with self.subTest(f"{tokenizer.__class__.__name__}"): - # # Our mask token - # mask = "" - # # We take a single word in the middle of the vocabulary - # all_tokens = 
sorted(tokenizer.get_vocab().keys()) - # word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens)//2], add_special_tokens=False)[:1]) - - # sequence_0 = "Encode " + word + " sequence" - # sequence_masked_0 = "Encode " + mask + " sequence" - - # sequence_1 = word + " this sequence" - # sequence_masked_1 = mask + " this sequence" - - # # Add tokens so that masked token isn't split - # # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()] - # # tokenizer.add_tokens(tokens) - # tokenizer.add_special_tokens( - # {"mask_token": AddedToken(mask, normalized=False)} - # ) # Eat left space on Byte-level BPE tokenizers - # mask_ind = tokenizer.convert_tokens_to_ids(mask) - - # # Test first masked sequence - # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False) - # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False) - # self.assertEqual(len(encoded_masked), len(encoded_0)) - # mask_loc = encoded_masked.index(mask_ind) - # encoded_masked[mask_loc] = encoded_0[mask_loc] - - # self.assertEqual(encoded_masked, encoded_0) - - # # Test second masked sequence - # encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False) - # encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False) - # self.assertEqual(len(encoded_masked), len(encoded_1)) - # mask_loc = encoded_masked.index(mask_ind) - # encoded_masked[mask_loc] = encoded_1[mask_loc] - - # self.assertEqual(encoded_masked, encoded_1) - - def test_special_tokens_mask(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence_0 = "Encode this." - # Testing single inputs - encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - sequence_0, add_special_tokens=True, return_special_tokens_mask=True # , add_prefix_space=False - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_special_tokens_mask_input_pairs(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence_0 = "Encode this." - sequence_1 = "This one too please." 
- encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) - encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - sequence_0, - sequence_1, - add_special_tokens=True, - return_special_tokens_mask=True, - # add_prefix_space=False, - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) - - def test_padding_side_in_kwargs(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - if self.test_rust_tokenizer: - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, padding_side="left", **kwargs - ) - self.assertEqual(tokenizer_r.padding_side, "left") - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, padding_side="right", **kwargs - ) - self.assertEqual(tokenizer_r.padding_side, "right") - - self.assertRaises( - ValueError, - self.rust_tokenizer_class.from_pretrained, - pretrained_name, - padding_side="unauthorized", - **kwargs, - ) - - if self.test_slow_tokenizer: - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="left", **kwargs) - self.assertEqual(tokenizer_p.padding_side, "left") - - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="right", **kwargs) - self.assertEqual(tokenizer_p.padding_side, "right") - - self.assertRaises( - ValueError, - self.tokenizer_class.from_pretrained, - pretrained_name, - padding_side="unauthorized", - **kwargs, - ) - - def test_truncation_side_in_kwargs(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - if self.test_rust_tokenizer: - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, truncation_side="left", **kwargs - ) - self.assertEqual(tokenizer_r.truncation_side, "left") - - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, truncation_side="right", **kwargs - ) - self.assertEqual(tokenizer_r.truncation_side, "right") - - self.assertRaises( - ValueError, - self.rust_tokenizer_class.from_pretrained, - pretrained_name, - truncation_side="unauthorized", - **kwargs, - ) - - if self.test_slow_tokenizer: - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, truncation_side="left", **kwargs - ) - self.assertEqual(tokenizer_p.truncation_side, "left") - - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, truncation_side="right", **kwargs - ) - self.assertEqual(tokenizer_p.truncation_side, "right") - - self.assertRaises( - ValueError, - self.tokenizer_class.from_pretrained, - pretrained_name, - truncation_side="unauthorized", - **kwargs, - ) - - def test_right_and_left_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - 
self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - sequence, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - self.assertEqual(sequence_length + padding_size, padded_sequence_length) - self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) - - # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "left" - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - sequence, max_length=sequence_length + padding_size, padding="max_length" - ) - padded_sequence_length = len(padded_sequence) - self.assertEqual(sequence_length + padding_size, padded_sequence_length) - self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence) - - # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(sequence, padding=True) - padded_sequence_right_length = len(padded_sequence_right) - self.assertEqual(sequence_length, padded_sequence_right_length) - self.assertEqual(encoded_sequence, padded_sequence_right) - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(sequence, padding="longest") - padded_sequence_left_length = len(padded_sequence_left) - self.assertEqual(sequence_length, padded_sequence_left_length) - self.assertEqual(encoded_sequence, padded_sequence_left) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(sequence) - padded_sequence_right_length = len(padded_sequence_right) - self.assertEqual(sequence_length, padded_sequence_right_length) - self.assertEqual(encoded_sequence, padded_sequence_right) - - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(sequence, padding=False) - padded_sequence_left_length = len(padded_sequence_left) - self.assertEqual(sequence_length, padded_sequence_left_length) - self.assertEqual(encoded_sequence, padded_sequence_left) - - def test_right_and_left_truncation(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence = "This is a test sequence" - - # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - truncation_size = 3 - tokenizer.truncation_side = "right" - encoded_sequence = tokenizer.encode(sequence, add_special_tokens=False) - sequence_length = len(encoded_sequence) - # Remove EOS/BOS tokens - truncated_sequence = tokenizer.encode( - sequence, max_length=sequence_length - truncation_size, truncation=True, add_special_tokens=False - ) - truncated_sequence_length = len(truncated_sequence) - self.assertEqual(sequence_length, truncated_sequence_length + truncation_size) - self.assertEqual(encoded_sequence[:-truncation_size], truncated_sequence) - - # LEFT PADDING - Check that it correctly pads when a maximum length is 
specified along with the truncation flag set to True
-                tokenizer.truncation_side = "left"
-                sequence_length = len(encoded_sequence)
-                truncated_sequence = tokenizer.encode(
-                    sequence, max_length=sequence_length - truncation_size, truncation=True, add_special_tokens=False
-                )
-                truncated_sequence_length = len(truncated_sequence)
-                self.assertEqual(sequence_length, truncated_sequence_length + truncation_size)
-                self.assertEqual(encoded_sequence[truncation_size:], truncated_sequence)
-
-                # RIGHT & LEFT TRUNCATION - Check that nothing is done for 'longest' and 'no_truncation'
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.truncation_side = "right"
-                truncated_sequence_right = tokenizer.encode(sequence, truncation=True, add_special_tokens=False)
-                truncated_sequence_right_length = len(truncated_sequence_right)
-                self.assertEqual(sequence_length, truncated_sequence_right_length)
-                self.assertEqual(encoded_sequence, truncated_sequence_right)
-
-                tokenizer.truncation_side = "left"
-                truncated_sequence_left = tokenizer.encode(
-                    sequence, truncation="longest_first", add_special_tokens=False
-                )
-                truncated_sequence_left_length = len(truncated_sequence_left)
-                self.assertEqual(sequence_length, truncated_sequence_left_length)
-                self.assertEqual(encoded_sequence, truncated_sequence_left)
-
-                tokenizer.truncation_side = "right"
-                truncated_sequence_right = tokenizer.encode(sequence, add_special_tokens=False)
-                truncated_sequence_right_length = len(truncated_sequence_right)
-                self.assertEqual(sequence_length, truncated_sequence_right_length)
-                self.assertEqual(encoded_sequence, truncated_sequence_right)
-
-                tokenizer.truncation_side = "left"
-                truncated_sequence_left = tokenizer.encode(sequence, truncation=False, add_special_tokens=False)
-                truncated_sequence_left_length = len(truncated_sequence_left)
-                self.assertEqual(sequence_length, truncated_sequence_left_length)
-                self.assertEqual(encoded_sequence, truncated_sequence_left)
-
-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` is deprecated."""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                sequence = "Sequence"
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequence)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(sequence)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
-                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(sequence)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                self.assertEqual(sequence_length, padded_sequence_right_length)
-                self.assertEqual(encoded_sequence,
padded_sequence_right) - - def test_padding_to_multiple_of(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.pad_token is None: - self.skipTest("No padding token.") - else: - empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) - normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) - for key, value in empty_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - normal_tokens = tokenizer("This", pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - # Should also work with truncation - normal_tokens = tokenizer("This", padding=True, truncation=True, pad_to_multiple_of=8) - for key, value in normal_tokens.items(): - self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8") - - # truncation to something which is not a multiple of pad_to_multiple_of raises an error - self.assertRaises( - ValueError, - tokenizer.__call__, - "This", - padding=True, - truncation=True, - max_length=12, - pad_to_multiple_of=8, - ) - - def test_padding_with_attention_mask(self): - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.pad_token is None: - self.skipTest("No padding token.") - if "attention_mask" not in tokenizer.model_input_names: - self.skipTest("This model does not use attention mask.") - - features = [ - {"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]}, - {"input_ids": [1, 2, 3], "attention_mask": [1, 1, 0]}, - ] - padded_features = tokenizer.pad(features) - if tokenizer.padding_side == "right": - self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [1, 1, 0, 0, 0, 0]]) - else: - self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]]) - - def test_encode_plus_with_padding(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence = "Sequence" - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_size = 10 - padding_idx = tokenizer.pad_token_id - token_type_padding_idx = tokenizer.pad_token_type_id - - encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) - input_ids = encoded_sequence["input_ids"] - special_tokens_mask = encoded_sequence["special_tokens_mask"] - sequence_length = len(input_ids) - - # Test 'longest' and 'no_padding' don't do anything - tokenizer.padding_side = "right" - - not_padded_sequence = tokenizer.encode_plus( - sequence, - padding=True, - return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - self.assertEqual(sequence_length, not_padded_sequence_length) - self.assertEqual(input_ids, not_padded_input_ids) - self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) - - not_padded_sequence = tokenizer.encode_plus( - sequence, - padding=False, - 
return_special_tokens_mask=True, - ) - not_padded_input_ids = not_padded_sequence["input_ids"] - - not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] - not_padded_sequence_length = len(not_padded_input_ids) - - self.assertEqual(sequence_length, not_padded_sequence_length) - self.assertEqual(input_ids, not_padded_input_ids) - self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) - - # Test right padding - tokenizer.padding_side = "right" - - right_padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - right_padded_input_ids = right_padded_sequence["input_ids"] - - right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] - right_padded_sequence_length = len(right_padded_input_ids) - - self.assertEqual(sequence_length + padding_size, right_padded_sequence_length) - self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids) - self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask) - - # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) - left_padded_input_ids = left_padded_sequence["input_ids"] - left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] - left_padded_sequence_length = len(left_padded_input_ids) - - self.assertEqual(sequence_length + padding_size, left_padded_sequence_length) - self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids) - self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask) - - if "token_type_ids" in tokenizer.model_input_names: - token_type_ids = encoded_sequence["token_type_ids"] - left_padded_token_type_ids = left_padded_sequence["token_type_ids"] - right_padded_token_type_ids = right_padded_sequence["token_type_ids"] - - self.assertEqual( - token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids - ) - self.assertEqual( - [token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids - ) - - if "attention_mask" in tokenizer.model_input_names: - attention_mask = encoded_sequence["attention_mask"] - right_padded_attention_mask = right_padded_sequence["attention_mask"] - left_padded_attention_mask = left_padded_sequence["attention_mask"] - - self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask) - self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask) - - def test_padding_warning_message_fast_tokenizer(self): - if not self.test_rust_tokenizer: - return - - sequence = "This is a text" - - tokenizer_fast = self.get_rust_tokenizer() - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer_fast, sequence) - - encoding_fast = tokenizer_fast(sequence) - - with self.assertLogs("mindnlp.transformers", level="WARNING") as cm: - tokenizer_fast.pad(encoding_fast) - self.assertEqual(len(cm.records), 1) - self.assertIn( - "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to" - " encode the text followed by a call to the `pad` method to get a padded encoding.", - cm.records[0].message, - ) - - if not self.test_slow_tokenizer: - return - - tokenizer_slow = self.get_tokenizer() - # 
check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer_slow, sequence) - - encoding_slow = tokenizer_slow(sequence) - - with self.assertLogs(level="WARNING") as cm: - # We want to assert there are no warnings, but the 'assertLogs' method does not support that. - # Therefore, we are adding a dummy warning, and then we will assert it is the only warning. - logger.warning("Dummy warning") - tokenizer_slow.pad(encoding_slow) - self.assertEqual(len(cm.records), 1) - self.assertIn( - "Dummy warning", - cm.records[0].message, - ) - - def test_separate_tokenizers(self): - # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when - # we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today. - - tokenizers = self.get_tokenizers(random_argument=True) - new_tokenizers = self.get_tokenizers(random_argument=False) - - for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers): - with self.subTest(f"{tokenizer.__class__.__name__}"): - self.assertTrue(tokenizer.init_kwargs["random_argument"]) - self.assertTrue(tokenizer.init_kwargs["random_argument"]) - self.assertFalse(new_tokenizer.init_kwargs["random_argument"]) - - def test_get_vocab(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab_dict = tokenizer.get_vocab() - self.assertIsInstance(vocab_dict, dict) - self.assertGreaterEqual(len(tokenizer), len(vocab_dict)) - - vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] - self.assertEqual(len(vocab), len(tokenizer)) - - tokenizer.add_tokens(["asdfasdfasdfasdf"]) - vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))] - self.assertEqual(len(vocab), len(tokenizer)) - - def test_conversion_reversible(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab = tokenizer.get_vocab() - for word, ind in vocab.items(): - if word == tokenizer.unk_token: - continue - self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind) - self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word) - - def test_call(self): - # Tests that all call wrap to encode_plus and batch_encode_plus - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - # Test not batched - encoded_sequences_1 = tokenizer.encode_plus(sequences[0]) - encoded_sequences_2 = tokenizer(sequences[0]) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test not batched pairs - encoded_sequences_1 = tokenizer.encode_plus(sequences[0], sequences[1]) - encoded_sequences_2 = tokenizer(sequences[0], sequences[1]) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test batched - encoded_sequences_1 = tokenizer.batch_encode_plus(sequences) - encoded_sequences_2 = tokenizer(sequences) - self.assertEqual(encoded_sequences_1, encoded_sequences_2) - - # Test batched pairs - encoded_sequences_1 = tokenizer.batch_encode_plus(list(zip(sequences, sequences))) - encoded_sequences_2 = tokenizer(sequences, sequences) - self.assertEqual(encoded_sequences_1, 
encoded_sequences_2)
-
-    def test_batch_encode_plus_batch_sequence_length(self):
-        # Tests that all encoded values have the correct size
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                sequences = [
-                    "Testing batch encode plus",
-                    "Testing batch encode plus with different sequence lengths",
-                    "Testing batch encode plus with different sequence lengths correctly pads",
-                ]
-
-                encoded_sequences = [tokenizer.encode_plus(sequence) for sequence in sequences]
-                encoded_sequences_batch = tokenizer.batch_encode_plus(sequences, padding=False)
-                self.assertListEqual(
-                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
-                )
-
-                maximum_length = len(
-                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
-                )
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequences)
-
-                encoded_sequences_padded = [
-                    tokenizer.encode_plus(sequence, max_length=maximum_length, padding="max_length")
-                    for sequence in sequences
-                ]
-
-                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(sequences, padding=True)
-                self.assertListEqual(
-                    encoded_sequences_padded,
-                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
-                )
-
-                # check 'longest' is insensitive to a max length
-                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(sequences, padding=True)
-                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
-                    sequences, max_length=maximum_length + 10, padding="longest"
-                )
-                for key in encoded_sequences_batch_padded_1.keys():
-                    self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key],
-                        encoded_sequences_batch_padded_2[key],
-                    )
-
-                # check 'no_padding' is insensitive to a max length
-                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(sequences, padding=False)
-                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
-                    sequences, max_length=maximum_length + 10, padding=False
-                )
-                for key in encoded_sequences_batch_padded_1.keys():
-                    self.assertListEqual(
-                        encoded_sequences_batch_padded_1[key],
-                        encoded_sequences_batch_padded_2[key],
-                    )
-
-    def test_added_token_are_matched_longest_first(self):
-        if not self.test_slow_tokenizer:
-            self.skipTest("This test is only for slow tokenizers")
-            return
-        tokenizers = self.get_tokenizers(fast=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                try:
-                    tokenizer.add_tokens([AddedToken("extra_id_1")])
-                    tokenizer.add_tokens([AddedToken("extra_id_100")])
-                except Exception:
-                    # Canine cannot add tokens which are not codepoints
-                    self.skipTest("Cannot add those Added tokens")
-
-                # XXX: This used to split on `extra_id_1` first; we're matching
-                # longest first now.
-                tokens = tokenizer.tokenize("This is some extra_id_100")
-                self.assertIn("extra_id_100", tokens)
-
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                tokenizer.add_tokens([AddedToken("extra_id_100")])
-                tokenizer.add_tokens([AddedToken("extra_id_1")])
-
-                tokens = tokenizer.tokenize("This is some extra_id_100")
-                self.assertIn("extra_id_100", tokens)
-
-    def test_added_token_serializable(self):
-        # TODO this is tested 10_000 times....
- tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - new_token = AddedToken("new_token", lstrip=True) - tokenizer.add_tokens([new_token]) - - with tempfile.TemporaryDirectory() as tmp_dir_name: - tokenizer.save_pretrained(tmp_dir_name) - tokenizer.from_pretrained(tmp_dir_name) - - def test_batch_encode_plus_padding(self): - # Test that padded sequences are equivalent between batch_encode_plus and encode_plus - - # Right padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences = [ - tokenizer.encode_plus(sequence, max_length=max_length, padding="max_length") - for sequence in sequences - ] - encoded_sequences_batch = tokenizer.batch_encode_plus( - sequences, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - # Left padding tests - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokenizer.padding_side = "left" - sequences = [ - "Testing batch encode plus", - "Testing batch encode plus with different sequence lengths", - "Testing batch encode plus with different sequence lengths correctly pads", - ] - - max_length = 100 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequences) - - encoded_sequences = [ - tokenizer.encode_plus(sequence, max_length=max_length, padding="max_length") - for sequence in sequences - ] - encoded_sequences_batch = tokenizer.batch_encode_plus( - sequences, max_length=max_length, padding="max_length" - ) - self.assertListEqual( - encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch) - ) - - def test_pretokenized_inputs(self): - # Test when inputs are pretokenized - - tokenizers = self.get_tokenizers(do_lower_case=False) # , add_prefix_space=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space: - continue - - # Prepare a sequence from our tokenizer vocabulary - sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20) - # sequence = " " + sequence # To be sure the byte-level tokenizers are feeling good - token_sequence = sequence.split() - # sequence_no_prefix_space = sequence.strip() - - # Test encode for pretokenized inputs - output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=False) - output_sequence = tokenizer.encode(sequence, add_special_tokens=False) - self.assertEqual(output, output_sequence) - - output = tokenizer.encode(token_sequence, is_split_into_words=True, add_special_tokens=True) - output_sequence = tokenizer.encode(sequence, add_special_tokens=True) - self.assertEqual(output, output_sequence) - - # Test encode_plus for pretokenized inputs - output = 
tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=False) - output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=False) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - output = tokenizer.encode_plus(token_sequence, is_split_into_words=True, add_special_tokens=True) - output_sequence = tokenizer.encode_plus(sequence, add_special_tokens=True) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - - # Test batch_encode_plus for pretokenized inputs - sequence_batch = [sequence.strip()] * 2 + [sequence.strip() + " " + sequence.strip()] - token_sequence_batch = [s.split() for s in sequence_batch] - sequence_batch_cleaned_up_spaces = [" " + " ".join(s) for s in token_sequence_batch] - - output = tokenizer.batch_encode_plus( - token_sequence_batch, is_split_into_words=True, add_special_tokens=False - ) - output_sequence = tokenizer.batch_encode_plus( - sequence_batch_cleaned_up_spaces, add_special_tokens=False - ) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - output = tokenizer.batch_encode_plus( - token_sequence_batch, is_split_into_words=True, add_special_tokens=True - ) - output_sequence = tokenizer.batch_encode_plus( - sequence_batch_cleaned_up_spaces, add_special_tokens=True - ) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - - # Test encode for pretokenized inputs pairs - output = tokenizer.encode( - token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False - ) - output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=False) - self.assertEqual(output, output_sequence) - output = tokenizer.encode( - token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True - ) - output_sequence = tokenizer.encode(sequence, sequence, add_special_tokens=True) - self.assertEqual(output, output_sequence) - - # Test encode_plus for pretokenized inputs pairs - output = tokenizer.encode_plus( - token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=False - ) - output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=False) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - output = tokenizer.encode_plus( - token_sequence, token_sequence, is_split_into_words=True, add_special_tokens=True - ) - output_sequence = tokenizer.encode_plus(sequence, sequence, add_special_tokens=True) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - - # Test batch_encode_plus for pretokenized inputs pairs - sequence_pair_batch = [(sequence.strip(), sequence.strip())] * 2 + [ - (sequence.strip() + " " + sequence.strip(), sequence.strip()) - ] - token_sequence_pair_batch = [tuple(s.split() for s in pair) for pair in sequence_pair_batch] - sequence_pair_batch_cleaned_up_spaces = [ - tuple(" " + " ".join(s) for s in pair) for pair in token_sequence_pair_batch - ] - - output = tokenizer.batch_encode_plus( - token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=False - ) - output_sequence = tokenizer.batch_encode_plus( - sequence_pair_batch_cleaned_up_spaces, add_special_tokens=False - ) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - output = tokenizer.batch_encode_plus( - token_sequence_pair_batch, is_split_into_words=True, add_special_tokens=True - ) - output_sequence = tokenizer.batch_encode_plus( - 
sequence_pair_batch_cleaned_up_spaces, add_special_tokens=True - ) - for key in output.keys(): - self.assertEqual(output[key], output_sequence[key]) - - def test_prepare_for_model(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - string_sequence = "Testing the prepare_for_model method." - ids = tokenizer.encode(string_sequence, add_special_tokens=False) - prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True) - - input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True) - - self.assertEqual(input_dict, prepared_input_dict) - - def test_batch_encode_plus_overflowing_tokens(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - string_sequences = ["Testing the prepare_for_model method.", "Test"] - - if tokenizer.pad_token is None: - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - tokenizer.batch_encode_plus( - string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3 - ) - - - def _check_no_pad_token_padding(self, tokenizer, sequences): - # if tokenizer does not have pad_token_id, an error should be thrown - if tokenizer.pad_token_id is None: - with self.assertRaises(ValueError): - if isinstance(sequences, list): - tokenizer.batch_encode_plus(sequences, padding="longest") - else: - tokenizer.encode_plus(sequences, padding=True) - - # add pad_token_id to pass subsequent tests - tokenizer.add_special_tokens({"pad_token": ""}) - - @require_mindspore - @slow - def test_mindspore_encode_plus_sent_to_model(self): - from mindnlp.transformers import MODEL_MAPPING, TOKENIZER_MAPPING - - MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) - - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return - - config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] - config = config_class() - - if config.is_encoder_decoder or config.pad_token_id is None: - return - - model = model_class(config) - - # Make sure the model contains at least the full vocabulary size in its embedding matrix - is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") - if is_using_common_embeddings: - self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer)) - - # Build sequence - first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] - sequence = " ".join(first_ten_tokens) - encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="ms") - - # Ensure that the BatchEncoding.to() method works. 
- - batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="ms") - # This should not fail - - model(**encoded_sequence) - model(**batch_encoded_sequence) - - # if self.test_rust_tokenizer: - # fast_tokenizer = self.get_rust_tokenizer() - # encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="ms") - # batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus([sequence, sequence], return_tensors="ms") - # # This should not fail - # model(**encoded_sequence_fast) - # model(**batch_encoded_sequence_fast) - - - @require_mindspore - @slow - def test_np_encode_plus_sent_to_model(self): - from mindnlp.transformers import MODEL_MAPPING, TOKENIZER_MAPPING - - MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING) - - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return - - config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] - config = config_class() - - if config.is_encoder_decoder or config.pad_token_id is None: - return - - # Build sequence - first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] - sequence = " ".join(first_ten_tokens) - encoded_sequence = tokenizer.encode_plus(sequence, return_tensors="np") - batch_encoded_sequence = tokenizer.batch_encode_plus([sequence, sequence], return_tensors="np") - - # TODO: add forward through JAX/Flax when PR is merged - # This is currently here to make ruff happy ! - if encoded_sequence is None: - raise ValueError("Cannot convert list to numpy tensor on encode_plus()") - - if batch_encoded_sequence is None: - raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus()") - - if self.test_rust_tokenizer: - fast_tokenizer = self.get_rust_tokenizer() - encoded_sequence_fast = fast_tokenizer.encode_plus(sequence, return_tensors="np") - batch_encoded_sequence_fast = fast_tokenizer.batch_encode_plus( - [sequence, sequence], return_tensors="np" - ) - - # TODO: add forward through JAX/Flax when PR is merged - # This is currently here to make ruff happy ! - if encoded_sequence_fast is None: - raise ValueError("Cannot convert list to numpy tensor on encode_plus() (fast)") - - if batch_encoded_sequence_fast is None: - raise ValueError("Cannot convert list to numpy tensor on batch_encode_plus() (fast)") - - @require_mindspore - def test_prepare_seq2seq_batch(self): - if not self.test_seq2seq: - return - - tokenizers = self.get_tokenizers() - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - # Longer text that will definitely require truncation. 
- src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for" - " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons" - " will only worsen the violence and misery for millions of people.", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al" - ' Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi' - " că noi arme nu vor face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", - ] - try: - batch = tokenizer.prepare_seq2seq_batch( - src_texts=src_text, - tgt_texts=tgt_text, - max_length=3, - max_target_length=10, - return_tensors="ms", - src_lang="en_XX", # this should be ignored (for all but mbart) but not cause an error - ) - except NotImplementedError: - return - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.labels.shape[1], 10) - # max_target_length will default to max_length if not specified - batch = tokenizer.prepare_seq2seq_batch( - src_text, tgt_texts=tgt_text, max_length=3, return_tensors="ms" - ) - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.labels.shape[1], 3) - - batch_encoder_only = tokenizer.prepare_seq2seq_batch( - src_texts=src_text, max_length=3, max_target_length=10, return_tensors="ms" - ) - self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) - self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) - self.assertNotIn("decoder_input_ids", batch_encoder_only) - - def test_is_fast(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - # Check is_fast is set correctly - self.assertTrue(tokenizer_r.is_fast) - - if self.test_slow_tokenizer: - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - self.assertFalse(tokenizer_p.is_fast) - - def test_fast_only_inputs(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Ensure None raise an error - self.assertRaises(TypeError, tokenizer_r.tokenize, None) - self.assertRaises(TypeError, tokenizer_r.encode, None) - self.assertRaises(TypeError, tokenizer_r.encode_plus, None) - self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None) - - def test_alignement_methods(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] - text = " ".join(words) - batch_size = 3 - - encoding = tokenizer_r.encode_plus(text, add_special_tokens=False) - - batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False) - num_tokens = len(encoding["input_ids"]) - - last_word_index = len(words) - 1 - last_token_index = num_tokens - 1 - last_batch_index = batch_size - 1 - last_char_index = len(text) - 1 - - # words, tokens - self.assertEqual(len(encoding.words(0)), 
num_tokens) - self.assertEqual(max(encoding.words(0)), last_word_index) - self.assertEqual(min(encoding.words(0)), 0) - self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens) - self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index) - self.assertEqual(min(batch_encoding.words(last_batch_index)), 0) - self.assertEqual(len(encoding.tokens(0)), num_tokens) - - # Assert token_to_word - self.assertEqual(encoding.token_to_word(0), 0) - self.assertEqual(encoding.token_to_word(0, 0), 0) - self.assertEqual(encoding.token_to_word(last_token_index), last_word_index) - self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index) - self.assertEqual(batch_encoding.token_to_word(1, 0), 0) - self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index) - self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index) - - # Assert word_to_tokens - self.assertEqual(encoding.word_to_tokens(0).start, 0) - self.assertEqual(encoding.word_to_tokens(0, 0).start, 0) - self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1) - self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) - self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0) - self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) - self.assertEqual( - batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1 - ) - - # Assert token_to_chars - self.assertEqual(encoding.token_to_chars(0).start, 0) - self.assertEqual(encoding.token_to_chars(0, 0).start, 0) - self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1) - self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) - self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0) - self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) - self.assertEqual( - batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1 - ) - - # Assert char_to_token - self.assertEqual(encoding.char_to_token(0), 0) - self.assertEqual(encoding.char_to_token(0, 0), 0) - self.assertEqual(encoding.char_to_token(last_char_index), last_token_index) - self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index) - self.assertEqual(batch_encoding.char_to_token(1, 0), 0) - self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index) - self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index) - - # Assert char_to_word - self.assertEqual(encoding.char_to_word(0), 0) - self.assertEqual(encoding.char_to_word(0, 0), 0) - self.assertEqual(encoding.char_to_word(last_char_index), last_word_index) - self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index) - self.assertEqual(batch_encoding.char_to_word(1, 0), 0) - self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index) - self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index) - - # Assert word_to_chars - self.assertEqual(encoding.word_to_chars(0).start, 0) - self.assertEqual(encoding.word_to_chars(0, 0).start, 0) - self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1) - self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) - self.assertEqual(batch_encoding.word_to_chars(1, 
0).start, 0) - self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) - self.assertEqual( - batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 - ) - - # Assert token_to_sequence - self.assertEqual(encoding.token_to_sequence(num_tokens // 2), 0) - self.assertEqual(encoding.token_to_sequence(0, num_tokens // 2), 0) - self.assertEqual(batch_encoding.token_to_sequence(1, num_tokens // 2), 0) - self.assertEqual(batch_encoding.token_to_sequence(0, num_tokens // 2), 0) - self.assertEqual(batch_encoding.token_to_sequence(last_batch_index, num_tokens // 2), 0) - - # Pair of input sequences - - words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] - text = " ".join(words) - pair_words = ["Amazing", "example", "full", "of", "inspiration"] - pair_text = " ".join(pair_words) - batch_size = 3 - index_word_in_first_seq = words.index("inspiration") - index_word_in_pair_seq = pair_words.index("inspiration") - index_char_in_first_seq = text.find("inspiration") - index_char_in_pair_seq = pair_text.find("inspiration") - - pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=False) - - pair_batch_encoding = tokenizer_r.batch_encode_plus( - [(text, pair_text)] * batch_size, add_special_tokens=False - ) - num_tokens = len(encoding["input_ids"]) - - last_word_index = len(words) - 1 - last_token_index = num_tokens - 1 - last_batch_index = batch_size - 1 - last_char_index = len(text) - 1 - - # Assert word_to_tokens - self.assertNotEqual( - pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start, - pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start, - ) - self.assertEqual( - pair_encoding["input_ids"][ - pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start - ], - pair_encoding["input_ids"][ - pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start - ], - ) - self.assertNotEqual( - pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start, - pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start, - ) - self.assertEqual( - pair_batch_encoding["input_ids"][1][ - pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start - ], - pair_batch_encoding["input_ids"][1][ - pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start - ], - ) - - # Assert char_to_token - self.assertNotEqual( - pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0), - pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1), - ) - self.assertEqual( - pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0)], - pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1)], - ) - self.assertNotEqual( - pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0), - pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1), - ) - self.assertEqual( - pair_batch_encoding["input_ids"][1][ - pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0) - ], - pair_batch_encoding["input_ids"][1][ - pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1) - ], - ) - - # Assert char_to_word - self.assertNotEqual( - pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0), - pair_encoding.char_to_word(index_char_in_pair_seq, 
sequence_index=1), - ) - self.assertEqual( - words[pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0)], - pair_words[pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1)], - ) - self.assertNotEqual( - pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0), - pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1), - ) - self.assertEqual( - words[pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0)], - pair_words[pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1)], - ) - - # Assert word_to_chars - self.assertNotEqual( - pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start, - pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start, - ) - self.assertEqual( - text[pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start], - pair_text[pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start], - ) - self.assertNotEqual( - pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start, - pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start, - ) - self.assertEqual( - text[pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start], - pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start], - ) - - # Assert token_to_sequence - pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=True) - - pair_sequence_ids = [ - pair_encoding.token_to_sequence(i) for i in range(len(pair_encoding["input_ids"])) - ] - self.assertIn(0, pair_sequence_ids) - self.assertIn(1, pair_sequence_ids) - if tokenizer_r.num_special_tokens_to_add(pair=True): - self.assertIn(None, pair_sequence_ids) - - pair_batch_encoding = tokenizer_r.batch_encode_plus( - [(text, pair_text)] * batch_size, add_special_tokens=True - ) - pair_batch_sequence_ids = [ - pair_batch_encoding.token_to_sequence(1, i) - for i in range(len(pair_batch_encoding["input_ids"][0])) - ] - self.assertIn(0, pair_batch_sequence_ids) - self.assertIn(1, pair_batch_sequence_ids) - if tokenizer_r.num_special_tokens_to_add(pair=True): - self.assertIn(None, pair_batch_sequence_ids) - - def test_tokenization_python_rust_equals(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Ensure basic input match - input_p = tokenizer_p.encode_plus(self._data) - input_r = tokenizer_r.encode_plus(self._data) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_p[key], input_r[key]) - - input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) - input_pairs_r = tokenizer_r.encode_plus(self._data, self._data) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) - - # Ensure truncation match - input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True) - input_r = tokenizer_r.encode_plus(self._data, 
max_length=512, truncation=True) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_p[key], input_r[key]) - - # Ensure truncation with stride match - input_p = tokenizer_p.encode_plus( - self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True - ) - input_r = tokenizer_r.encode_plus( - self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True - ) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_p[key], input_r[key][0]) - - def test_num_special_tokens_to_add_equal(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual( - tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False) - ) - self.assertEqual( - tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True) - ) - - def test_max_length_equal(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) - - def test_special_tokens_map_equal(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - # sometimes the tokenizer saved online is not the same - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - # Assert the set of special tokens match. 
- self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - ) - - def test_add_tokens(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - vocab_size = len(tokenizer_r) - self.assertEqual(tokenizer_r.add_tokens(""), 0) - self.assertEqual(tokenizer_r.add_tokens("testoken"), 1) - self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2) - self.assertEqual(len(tokenizer_r), vocab_size + 3) - - self.assertEqual(tokenizer_r.add_special_tokens({}), 0) - self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) - self.assertRaises( - AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": ""} - ) - self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": [""]}), 1) - self.assertEqual( - tokenizer_r.add_special_tokens({"additional_special_tokens": ["", ""]}), 2 - ) - self.assertIn("", tokenizer_r.special_tokens_map["additional_special_tokens"]) - self.assertIsInstance(tokenizer_r.special_tokens_map["additional_special_tokens"], list) - self.assertGreaterEqual(len(tokenizer_r.special_tokens_map["additional_special_tokens"]), 2) - - self.assertEqual(len(tokenizer_r), vocab_size + 8) - - def test_offsets_mapping(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - text = "Wonderful no inspiration example with subtoken" - pair = "Along with an awesome pair" - - # No pair - tokens_with_offsets = tokenizer_r.encode_plus( - text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True - ) - added_tokens = tokenizer_r.num_special_tokens_to_add(False) - offsets = tokens_with_offsets["offset_mapping"] - - # Assert there is the same number of tokens and offsets - self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) - - # Assert there is online added_tokens special_tokens - self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - - # Pairs - tokens_with_offsets = tokenizer_r.encode_plus( - text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True - ) - added_tokens = tokenizer_r.num_special_tokens_to_add(True) - offsets = tokens_with_offsets["offset_mapping"] - - # Assert there is the same number of tokens and offsets - self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) - - # Assert there is online added_tokens special_tokens - self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - - def test_batch_encode_dynamic_overflowing(self): - """ - When calling batch_encode with multiple sequence it can returns different number of - overflowing encoding for each sequence: - [ - Sequence 1: [Encoding 1, Encoding 2], - Sequence 2: [Encoding 1], - Sequence 3: [Encoding 1, Encoding 2, ... 
Encoding N] - ] - This needs to be padded so that it can represented as a tensor - """ - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_mindspore_available(): - returned_tensor = "ms" - else: - return - - if not tokenizer.pad_token or tokenizer.pad_token_id < 0: - return - - tokens = tokenizer.encode_plus( - "HuggingFace is solving NLP one commit at a time", - max_length=6, - padding=True, - truncation=True, - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - self.assertEqual(len(tokens[key].shape), 2) - - # Mono sample - tokens = tokenizer.batch_encode_plus( - ["HuggingFace is solving NLP one commit at a time"], - max_length=6, - padding=True, - truncation="only_first", - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - self.assertEqual(len(tokens[key].shape), 2) - self.assertEqual(tokens[key].shape[-1], 6) - - # Multi sample - tokens = tokenizer.batch_encode_plus( - ["HuggingFace is solving NLP one commit at a time", "Very tiny input"], - max_length=6, - padding=True, - truncation="only_first", - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - self.assertEqual(len(tokens[key].shape), 2) - self.assertEqual(tokens[key].shape[-1], 6) - - def test_compare_pretokenized_inputs(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space: - continue # Too hard to test for now - - # Input string - pretokenized_input_simple = "This is a sample input".split() - pretokenized_input_pair = "This is a sample pair".split() - - # Test encode for pretokenized inputs - output_r = tokenizer_r.encode( - pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False - ) - output_p = tokenizer_p.encode( - pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False - ) - self.assertEqual(output_p, output_r) - - kwargs = { - "is_split_into_words": True, - # "return_token_type_ids": True, # Use the defaults for each tokenizers - # "return_attention_mask": True, # Use the defaults for each tokenizers - "return_overflowing_tokens": False, - "return_special_tokens_mask": True, - "return_offsets_mapping": False, # Not implemented in python tokenizers - # "add_special_tokens": False, - } - batch_kwargs = { - "is_split_into_words": True, - # "return_token_type_ids": True, # Use the defaults for each tokenizers - # "return_attention_mask": True, # Use the defaults for each tokenizers - "return_overflowing_tokens": False, - "return_special_tokens_mask": True, - "return_offsets_mapping": False, # Not implemented in python tokenizers - # "add_special_tokens": False, - } - # Test 
encode_plus for pretokenized inputs - output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs) - output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - # Test batch_encode_plus for pretokenized inputs - input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair] - output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs) - output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - # Test encode for pretokenized inputs pairs - output_r = tokenizer_r.encode( - pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True - ) - output_p = tokenizer_p.encode( - pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True - ) - self.assertEqual(output_p, output_r) - - # Test encode_plus for pretokenized inputs - output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) - output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - # Test batch_encode_plus for pretokenized inputs - input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [ - pretokenized_input_simple + pretokenized_input_pair, - pretokenized_input_pair, - ] - output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs) - output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - def test_create_token_type_ids(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - input_simple = [1, 2, 3] - input_pair = [1, 2, 3] - - # Generate output - output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple) - output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple) - self.assertEqual(output_p, output_r) - - # Generate pair output - output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair) - output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair) - self.assertEqual(output_p, output_r) - - def test_build_inputs_with_special_tokens(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - # # Input string - # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False) - # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False) - - # # Generate output - # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) - # 
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) - # self.assertEqual(output_p, output_r) - - # # Generate pair output - # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) - # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) - # self.assertEqual(output_p, output_r) - - input_pairs = [ - ("", ""), - ("", "This is a sample pair"), - ("This is a sample input", ""), - ("This is a sample input", "This is a sample pair"), - ] - - for sample_input, sample_pair in input_pairs: - # Input tokens id - input_simple = tokenizer_p.encode(sample_input, add_special_tokens=False) - input_pair = tokenizer_p.encode(sample_pair, add_special_tokens=False) - - # Generate output - output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) - output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) - self.assertEqual(output_p, output_r) - - # Generate pair output - output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) - output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) - self.assertEqual(output_p, output_r) - - def test_padding(self, max_length=50): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) - pad_token_id = tokenizer_p.pad_token_id - - # Encode - Simple input - input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") - input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.encode("This is a simple input", padding="longest") - input_p = tokenizer_p.encode("This is a simple input", padding=True) - self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) - - # Encode - Pair input - input_r = tokenizer_r.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.encode( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True) - input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest") - self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) - - # 
Encode_plus - Simple input - input_r = tokenizer_r.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus( - "This is a simple input", max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", max_length=max_length, padding="max_length" - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest") - input_p = tokenizer_p.encode_plus("This is a simple input", padding=True) - self.assert_padded_input_match( - input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id - ) - - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - # Encode_plus - Pair input - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest") - input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True) - self.assert_padded_input_match( - input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id - ) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - # Batch_encode_plus - Simple input - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding="max_length", - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding="max_length", - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding="longest", - ) - input_p = 
tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], padding="longest" - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], padding=True - ) - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - # Batch_encode_plus - Pair input - input_r = tokenizer_r.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - max_length=max_length, - truncation=True, - padding="max_length", - ) - input_p = tokenizer_p.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - max_length=max_length, - truncation=True, - padding="max_length", - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - input_r = tokenizer_r.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - padding=True, - ) - input_p = tokenizer_p.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - padding="longest", - ) - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - # Using pad on single examples after tokenization - input_r = tokenizer_r.encode_plus("This is a input 1") - input_r = tokenizer_r.pad(input_r) - - input_p = tokenizer_p.encode_plus("This is a input 1") - input_p = tokenizer_p.pad(input_p) - - self.assert_padded_input_match( - input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id - ) - - # Using pad on single examples after tokenization - input_r = tokenizer_r.encode_plus("This is a input 1") - input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") - - input_p = tokenizer_p.encode_plus("This is a input 1") - input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length") - - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - - # Using pad after tokenization - input_r = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_r = tokenizer_r.pad(input_r) - - input_p = tokenizer_p.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_p = tokenizer_p.pad(input_p) - - self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id) - - # Using pad after tokenization - input_r = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") - - input_p = tokenizer_p.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length") - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - # Test padding nested empty lists (in some use-cases, there is 
no any token id in the `input_ids` list). - input_r = tokenizer_r.pad({"input_ids": [[], []]}, max_length=max_length, padding="max_length") - input_p = tokenizer_p.pad({"input_ids": [[], []]}, max_length=max_length, padding="max_length") - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - - def test_padding_different_model_input_name(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) - pad_token_id = tokenizer_p.pad_token_id - - input_r = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_p = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - - # rename encoded batch to "inputs" - input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]] - del input_r[tokenizer_r.model_input_names[0]] - - input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]] - del input_p[tokenizer_p.model_input_names[0]] - - # Renaming `input_ids` to `inputs` - tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:] - tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:] - - input_r = tokenizer_r.pad(input_r, padding="longest") - input_p = tokenizer_r.pad(input_p, padding="longest") - - max_length = len(input_p["inputs"][0]) - self.assert_batch_padded_input_match( - input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs" - ) - - def test_save_pretrained(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # make sure that all ".json" files are saved in the correct format - for file_path in tokenizer_r_files + tokenizer_p_files: - if os.path.exists(file_path) and file_path.endswith(".json"): - check_json_file_has_correct_format(file_path) - - # Checks it save with the same files + the tokenizer.json file for the fast one - self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) - tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f) - self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) - # 
self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) - - shutil.rmtree(tmpdirname2) - - # Save tokenizer rust, legacy_format=True - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it save with the same files - self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - # Save tokenizer rust, legacy_format=False - tmpdirname2 = tempfile.mkdtemp() - - tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False) - tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) - - # Checks it saved the tokenizer.json file - self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files)) - - # Checks everything loads correctly in the same way - tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) - tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - - shutil.rmtree(tmpdirname2) - - def test_embeded_special_tokens(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - sentence = "A, AllenNLP sentence." 
- tokens_r = tokenizer_r.encode_plus( - sentence, - add_special_tokens=True, - ) - tokens_p = tokenizer_p.encode_plus( - sentence, - add_special_tokens=True, - ) - - for key in tokens_p.keys(): - self.assertEqual(tokens_r[key], tokens_p[key]) - - if "token_type_ids" in tokens_r: - self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) - - tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) - tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - self.assertSequenceEqual(tokens_r, tokens_p) - - def test_compare_add_special_tokens(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) - # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True) - - for text in ["", " "]: - # tokenize() - no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False) - with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True) - self.assertEqual( - len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add - ) - - # encode() - no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False) - with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True) - self.assertEqual( - len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add - ) - - # encode_plus() - no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False) - with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True) - for key in no_special_tokens.keys(): - self.assertEqual( - len(no_special_tokens[key]), - len(with_special_tokens[key]) - simple_num_special_tokens_to_add, - ) - - # # batch_encode_plus - no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False) - with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True) - for key in no_special_tokens.keys(): - for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): - self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) - - def test_compare_prepare_for_model(self): - if not self.test_slow_tokenizer: - # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - string_sequence = "Asserting that both tokenizers are equal" - python_output = tokenizer_p.prepare_for_model( - tokenizer_p.encode(string_sequence, add_special_tokens=False) - ) - rust_output = tokenizer_r.prepare_for_model( - tokenizer_r.encode(string_sequence, add_special_tokens=False) - ) - for key in python_output: - self.assertEqual(python_output[key], rust_output[key]) - - def test_special_tokens_initialization(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, 
**kwargs - ) - r_output = tokenizer_r.encode("Hey this is a token") - - special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] - - self.assertTrue(special_token_id in r_output) - - if self.test_slow_tokenizer: - # in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens` - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True - ) - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, additional_special_tokens=added_tokens, **kwargs - ) - - p_output = tokenizer_p.encode("Hey this is a token") - - cr_output = tokenizer_cr.encode("Hey this is a token") - - self.assertEqual(p_output, r_output) - self.assertEqual(cr_output, r_output) - self.assertTrue(special_token_id in p_output) - self.assertTrue(special_token_id in cr_output) - - def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self): - # This test no longer support rust tokenizers, because the only file that should be looked - # at by the fast tokenizer with the new saving format is `tokenizer_config.json`. - # The previous behaviour is very strange too. Fast tokenizer should not save 3 files, but just one. Can never do slow from fast. - tokenizer_list = [] - if self.test_slow_tokenizer: - tokenizer_list.append((self.tokenizer_class, self.get_tokenizer())) - - for tokenizer_class, tokenizer_utils in tokenizer_list: - with tempfile.TemporaryDirectory() as tmp_dir: - tokenizer_utils.save_pretrained(tmp_dir) - # only legacy save will check this - tokenizer_path = "tokenizer_config.json" - with open(os.path.join(tmp_dir, tokenizer_path), encoding="utf-8") as json_file: - tokenizer_config = json.load(json_file) - - tokenizer_config["additional_special_tokens"] = ["an_additional_special_token"] - - with open(os.path.join(tmp_dir, tokenizer_path), "w", encoding="utf-8") as outfile: - json.dump(tokenizer_config, outfile) - - # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes - # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and - # "special_tokens_map.json" files - - # TODO ArthurZ ... Ok so for legacy we have to support this I guess..... 
(special_tokens_map + additional) - tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir) - self.assertIn( - "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens - ) - self.assertIn("an_additional_special_token", tokenizer_without_change_in_init.get_vocab()) - self.assertEqual( - ["an_additional_special_token"], - tokenizer_without_change_in_init.convert_ids_to_tokens( - tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"]) - ), - ) - - # Now we test that we can change the value of additional_special_tokens in the from_pretrained - new_added_tokens = [AddedToken("a_new_additional_special_token", lstrip=True)] - tokenizer = tokenizer_class.from_pretrained( - tmp_dir, - additional_special_tokens=new_added_tokens, - ) - - self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens) - self.assertEqual( - ["a_new_additional_special_token"], - tokenizer.convert_ids_to_tokens( - tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"]) - ), - ) - - def test_training_new_tokenizer(self): - # This feature only exists for fast tokenizers - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_rust_tokenizer() - new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) - - # Test we can use the new tokenizer with something not seen during training - inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."]) - self.assertEqual(len(inputs["input_ids"]), 2) - decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) - expected_result = "This is the first sentence" - - if tokenizer.backend_tokenizer.normalizer is not None: - expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) - self.assertEqual(expected_result, decoded_input) - - # We check that the parameters of the tokenizer remained the same - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) - self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) - - # Check we have the correct max_length for both pair and non-pair inputs. 
- self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) - self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) - - # Assert the set of special tokens match as we didn't ask to change them - self.assertSequenceEqual( - tokenizer.all_special_tokens_extended, - new_tokenizer.all_special_tokens_extended, - ) - - self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) - - def test_training_new_tokenizer_with_special_tokens_change(self): - # This feature only exists for fast tokenizers - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_rust_tokenizer() - # Test with a special tokens map - class_signature = inspect.signature(tokenizer.__class__) - if "cls_token" in class_signature.parameters: - new_tokenizer = tokenizer.train_new_from_iterator( - SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: ""} - ) - cls_id = new_tokenizer.get_vocab()[""] - self.assertEqual(new_tokenizer.cls_token, "") - self.assertEqual(new_tokenizer.cls_token_id, cls_id) - - # Create a new mapping from the special tokens defined in the original tokenizer - special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() - special_tokens_list.remove("additional_special_tokens") - special_tokens_map = {} - for token in special_tokens_list: - # Get the private one to avoid unnecessary warnings. - if getattr(tokenizer, f"_{token}") is not None: - special_token = getattr(tokenizer, token) - special_tokens_map[special_token] = f"{special_token}a" - - # Train new tokenizer - new_tokenizer = tokenizer.train_new_from_iterator( - SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map - ) - - # Check the changes - for token in special_tokens_list: - # Get the private one to avoid unnecessary warnings. - if getattr(tokenizer, f"_{token}") is None: - continue - special_token = getattr(tokenizer, token) - if special_token in special_tokens_map: - new_special_token = getattr(new_tokenizer, token) - self.assertEqual(special_tokens_map[special_token], new_special_token) - - new_id = new_tokenizer.get_vocab()[new_special_token] - self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) - - # Check if the AddedToken / string format has been kept - for special_token in tokenizer.all_special_tokens_extended: - if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: - # The special token must appear identically in the list of the new tokenizer. - self.assertTrue( - special_token in new_tokenizer.all_special_tokens_extended, - f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", - ) - elif isinstance(special_token, AddedToken): - # The special token must appear in the list of the new tokenizer as an object of type AddedToken with - # the same parameters as the old AddedToken except the content that the user has requested to change. 
- special_token_str = special_token.content - new_special_token_str = special_tokens_map[special_token_str] - - find = False - for candidate in new_tokenizer.all_special_tokens_extended: - if ( - isinstance(candidate, AddedToken) - and candidate.content == new_special_token_str - and candidate.lstrip == special_token.lstrip - and candidate.rstrip == special_token.rstrip - and candidate.normalized == special_token.normalized - and candidate.single_word == special_token.single_word - ): - find = True - break - special_token.content = new_special_token_str - self.assertTrue( - find, - f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = " - f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k)==new_special_token_str]} but it is missing" - ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.", - ) - elif special_token not in special_tokens_map: - # The special token must appear identically in the list of the new tokenizer. - self.assertTrue( - special_token in new_tokenizer.all_special_tokens_extended, - f"'{special_token.__repr__()}' should be in {new_tokenizer.all_special_tokens_extended}", - ) - - else: - # The special token must appear in the list of the new tokenizer as an object of type string. - self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) - - # Test we can use the new tokenizer with something not seen during training - inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."]) - self.assertEqual(len(inputs["input_ids"]), 2) - decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) - expected_result = "This is the first sentence" - - if tokenizer.backend_tokenizer.normalizer is not None: - expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result) - self.assertEqual(expected_result, decoded_input) - - @unittest.skip('not support yet.') - def test_saving_tokenizer_trainer(self): - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - with tempfile.TemporaryDirectory() as tmp_dir: - # Save the fast tokenizer files in a temporary directory - tokenizer_old = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, use_fast=True) - tokenizer_old.save_pretrained(tmp_dir, legacy_format=False) # save only fast version - - # Initialize toy model for the trainer - model = nn.Dense(1, 1) - - # Load tokenizer from a folder without legacy files - tokenizer = self.rust_tokenizer_class.from_pretrained(tmp_dir) - training_args = TrainingArguments(output_dir=tmp_dir, do_train=True, no_cuda=True) - trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer) - - # Should not raise an error - trainer.save_model(os.path.join(tmp_dir, "checkpoint")) - self.assertIn("tokenizer.json", os.listdir(os.path.join(tmp_dir, "checkpoint"))) - - def test_convert_tokens_to_string_format(self): - tokenizers = self.get_tokenizers(fast=True, do_lower_case=True) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - tokens = ["this", "is", "a", "test"] - string = tokenizer.convert_tokens_to_string(tokens) - - self.assertIsInstance(string, str) - - def test_save_slow_from_fast_and_reload_fast(self): - if not self.test_slow_tokenizer or not self.test_rust_tokenizer: - # we need both slow and fast versions - return - - for tokenizer, 
pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - with tempfile.TemporaryDirectory() as tmp_dir_1: - # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can - # still save only the slow version and use these saved files to rebuild a tokenizer - tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained( - pretrained_name, **kwargs, use_fast=True - ) - tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json") - tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file) - - tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained( - pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file - ) - - tokenizer_fast_old_2.save_pretrained(tmp_dir_1, legacy_format=True) # save only slow version - - tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir_1) - with tempfile.TemporaryDirectory() as tmp_dir_2: - tokenizer_slow.save_pretrained(tmp_dir_2) - - # Should not raise an error - self.rust_tokenizer_class.from_pretrained(tmp_dir_2) - - # TODO This is ran for all models but only tests bert... - def test_clean_up_tokenization_spaces(self): - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - assert tokenizer.clean_up_tokenization_spaces is True - - tokens = tokenizer.encode("This shouldn't be! He'll go.") - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" - - tokenizer.clean_up_tokenization_spaces = False - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]" - assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False) - - # Fast from slow - with tempfile.TemporaryDirectory() as tmp_dir_2: - tokenizer.save_pretrained(tmp_dir_2) - tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2) - del tokenizer - - assert tokenizer_fast.clean_up_tokenization_spaces is False - decoded = tokenizer_fast.decode(tokens) - # fast and slow don't have the same output when we don't cleanup - # tokenization space. Here `be!` vs `be !` and `go.` vs `go .` - assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]" - - tokenizer_fast.clean_up_tokenization_spaces = True - assert tokenizer_fast.clean_up_tokenization_spaces is True - - decoded = tokenizer_fast.decode(tokens) - assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" - - # Slow from fast - with tempfile.TemporaryDirectory() as tmp_dir_2: - tokenizer_fast.clean_up_tokenization_spaces = False - tokenizer_fast.save_pretrained(tmp_dir_2) - tokenizer = BertTokenizer.from_pretrained(tmp_dir_2) - - assert tokenizer.clean_up_tokenization_spaces is False - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]" - - tokenizer.clean_up_tokenization_spaces = True - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn't be! he'll go. 
[SEP]" - - def test_split_special_tokens(self): - if not self.test_slow_tokenizer: - return - - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - special_token = "[SPECIAL_TOKEN]" - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - - if not tokenizer.is_fast: - # bloom, gptneox etc only have a fast - tokenizer.add_special_tokens( - { - "additional_special_tokens": [ - AddedToken(special_token, rstrip=True, lstrip=True, normalized=True, special=True) - ] - } - ) - encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False) - self.assertEqual(len(encoded_special_token), 1) - - encoded_split_special_token = tokenizer.encode( - special_token, add_special_tokens=False, split_special_tokens=True - ) - if len(encoded_split_special_token) == 1: - # if we have subword tokenization or special vocab - self.assertTrue( - encoded_split_special_token[0] != tokenizer.convert_tokens_to_ids(special_token) - ) - else: - self.assertTrue(len(encoded_split_special_token) > 1) - - def test_added_tokens_serialization(self): - # Utility to test the added vocab - def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir): - tokenizer = tokenizer_class.from_pretrained(temp_dir) - self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens) - self.assertIn(new_eos, tokenizer.added_tokens_decoder.values()) - self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos) - self.assertDictEqual(expected, tokenizer.added_tokens_decoder) - return tokenizer - - new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True) - for tokenizer, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - # Load a slow tokenizer from the hub, init with the new token for fast to also include it - tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos) - EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder - with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"): - self.assertEqual(tokenizer._eos_token, new_eos) - self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values())) - - with tempfile.TemporaryDirectory() as tmp_dir_2: - tokenizer.save_pretrained(tmp_dir_2) - with self.subTest( - "Hub -> Slow -> Slow: Test saving this slow tokenizer and reloading it in the fast class" - ): - _test_added_vocab_and_eos( - EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_2 - ) - - if self.rust_tokenizer_class is not None: - with self.subTest( - "Hub -> Slow -> Fast: Test saving this slow tokenizer and reloading it in the fast class" - ): - tokenizer_fast = _test_added_vocab_and_eos( - EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_2 - ) - with tempfile.TemporaryDirectory() as tmp_dir_3: - tokenizer_fast.save_pretrained(tmp_dir_3) - with self.subTest( - "Hub -> Slow -> Fast -> Fast: Test saving this fast tokenizer and reloading it in the fast class" - ): - _test_added_vocab_and_eos( - EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3 - ) - - with self.subTest( - "Hub -> Slow -> Fast -> Slow: Test saving this slow tokenizer and reloading it in the slow class" - ): - _test_added_vocab_and_eos( - EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3 - ) - - with self.subTest("Hub -> Fast: 
Test loading a fast tokenizer from the hub)"): - if self.rust_tokenizer_class is not None: - tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos) - self.assertEqual(tokenizer_fast._eos_token, new_eos) - self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values())) - # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright - with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"): - self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder) - - EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder - with tempfile.TemporaryDirectory() as tmp_dir_4: - tokenizer_fast.save_pretrained(tmp_dir_4) - with self.subTest("Hub -> Fast -> Fast: saving Fast1 locally and loading"): - _test_added_vocab_and_eos( - EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_4 - ) - - with self.subTest("Hub -> Fast -> Slow: saving Fast1 locally and loading"): - _test_added_vocab_and_eos( - EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_4 - )